rm(list=ls(all=t))

Setup filenames

filename <- "Costa Rica_Public Use" # !!!Update filename
functions_vers <-  "functions_1.6.R" # !!!Update helper functions file

Setup data, functions and create dictionary for dataset review

source (functions_vers)
#mydata <- mydata [1:10,] # remove '#' from #mydata if you want to conduct a fast check on 10 rows. 

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

#!!!Save flagged dictionary in .xlsx format, add "DatasetReview" to name and continue processing data with subset of flagged variables

Direct PII: variables to be removed

# !!!No Direct PII

Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects

!!!Replace vector in "variables" field below with relevant variable names

# Encode Direct PII-team

mydata <- encode_direct_PII_team (variables="b_entrevistador")
## [1] "Frequency table before encoding"
## b_entrevistador. Interviewer/Enumerator
##                                              Brenda Hernandez           Diana Quiros Jimenez 
##                            400                             67                             87 
##         Estefania Bonilla Vega      Luis Daniel Rosales Arias Meyling Berroteran Largaespada 
##                             87                             57                             96 
##           Tatiana Urena Picado      Yahaira Cisneros Alvarado         Yuliana Castro Herrera 
##                             89                             47                             21 
## [1] "Frequency table after encoding"
## b_entrevistador. Interviewer/Enumerator
##   1   2   3   4   5   6   7   8   9 
## 400  67  87  87  57  96  89  47  21

Small locations: Encode locations with pop <100,000 using random large numbers

!!!Include relevant variables, but check their population size first to confirm they are <100,000

locvars <- c("districtnum") 
mydata <- encode_location (variables= locvars, missing=999999)
## [1] "Frequency table before encoding"
## districtnum. District Number, Numeric
##                 Alajuelita                  Arancibia                  Aserr<ed>                  Barbacoas 
##                          7                          6                          2                          1 
##                    Bolivar                Buena Vista                      Cachi                    Cahuita 
##                          1                          1                          5                          6 
##               Calle Zamora                 Capellades                     Cedral                    Central 
##                          3                          2                          1                          4 
##                      Chira                     Chires                     Chomes                  Cure<f1>a 
##                          9                          1                          6                          3 
##               Desamparados               Dulce Nombre                  Florencia                    Fortuna 
##                          2                          1                          1                          4 
##                  Gravilias                    Guacimo                 Guanacaste                     Guatil 
##                          4                          2                          1                          1 
##                    Hatillo             Hatillo Centro                    Higuito                   Hojancha 
##                          2                          1                          2                          5 
##                  Horquetas                     Huacas        Huetar Atl<e1>ntico               Huetar Norte 
##                          9                          1                          1                          1 
##                       Ipis                 Isla Chira                Isla Venado                 Jim<e9>nez 
##                          1                          5                          2                          1 
##                  La Carpio                    La Cruz                La Uni<f3>n                  La Virgen 
##                          3                          2                          7                          1 
##                    La cruz                  La granja                La uni<f3>n                    Lepanto 
##                         11                          1                          2                         23 
##               Le<f3>n XIII                    Liberia                 Los Cerros                  Los Guido 
##                          1                         11                          1                          1 
##                  Los guido                 Manzanillo                    Miramar              Montes de Oro 
##                          1                          5                          1                          3 
##                    Morales                     Nicoya                   Nosarita                    Pacayas 
##                          1                          3                          1                          1 
##                  Palmichal                      Pavas           Pe<f1>as blancas                    Primero 
##                          2                         13                          1                          1 
##                     Purral                Quebradilla                R<ed>o Azul                 San Carlos 
##                          7                          2                          1                          1 
##                 San Felipe              San Francisco San Francisco de 2 R<ed>os                San Gabriel 
##                          9                          1                          2                          1 
##                San Ignacio                 San Isidro                   San Juan           San Juan de Dios 
##                          5                          1                          2                          4 
##                   San Luis                 San Miguel                  San Pablo                  San Pedro 
##                          4                         55                         10                          2 
##   San Pedro. Montes de Oca                 San Rafael              San Sebastian                   San luis 
##                          1                          7                          1                          1 
##                  San pablo              Santa Cecilia                    Sixaola                    Tablazo 
##                          1                         14                         15                          1 
##                   Tib<e1>s                   Tirrases                 Tucurrique                 Turrubares 
##                          1                         12                         19                          5 
##                   Ure<f1>a                Villa Jaris                 <c1>ngeles                       <NA> 
##                          2                          1                          1                        560 
## [1] "Frequency table after encoding"
## districtnum. District Number, Numeric
##  296  297  298  299  300  301  302  303  304  305  306  307  308  309  310  311  312  313  314  315  316  317  318 
##    1    7    2   15    2    1    5   12    6    4    5    1    2    1    3    4    7    1   19    1    1    2    4 
##  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336  337  338  339  340  341 
##    1    1    1   13    1    1    2    1    2    5    6    1   14    1    1    1    2    1    2    9    1    1    1 
##  342  343  344  345  346  347  348  349  350  351  352  353  354  355  356  357  358  359  360  361  362  363  364 
##    1    1    5    1    9   10    3   23    1    3   55    2    1    1    1    9    1   11    1    1    1    5    2 
##  365  366  367  368  369  370  371  372  373  374  375  376  377  378  379  380  381  382  383  384  385  386 <NA> 
##    3    1    2    4    1    1    1    1    5    2    1    6    1    1    7   11    2    4    2    2    3    7  560

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" of 10 or less. 

break_edu <- c(1,2,3,4,5,7,777,888)
labels_edu <- c("1. None" =1, 
                "2. Don't know" =2, 
                "3. Primary school" =3, 
                "4. Secondary school" =4, 
                "5. Technical or vocational school or University" =5)
mydata <- ordinal_recode (variable="hhheadeduc", break_points=break_edu, missing=999999, value_labels=labels_edu)

## [1] "Frequency table before encoding"
## hhheadeduc. Hh Head Level Of Education
##                           1. None                     2. Don't know                 3. Primary school 
##                               153                                10                               649 
##               4. Secondary school 5. Technical or vocational school                     6. University 
##                               110                                 6                                13 
##                              <NA> 
##                                10 
##    recoded
##     [1,2) [2,3) [3,4) [4,5) [5,7) [7,777) [777,888) [888,1e+06)
##   1   153     0     0     0     0       0         0           0
##   2     0    10     0     0     0       0         0           0
##   3     0     0   649     0     0       0         0           0
##   4     0     0     0   110     0       0         0           0
##   5     0     0     0     0     6       0         0           0
##   6     0     0     0     0    13       0         0           0
## [1] "Frequency table after encoding"
## hhheadeduc. Hh Head Level Of Education
##                                         1. None                                   2. Don't know 
##                                             153                                              10 
##                               3. Primary school                             4. Secondary school 
##                                             649                                             110 
## 5. Technical or vocational school or University                                            <NA> 
##                                              19                                              10 
## [1] "Inspect value labels and relabel as necessary"
##                                         1. None                                   2. Don't know 
##                                               1                                               2 
##                               3. Primary school                             4. Secondary school 
##                                               3                                               4 
## 5. Technical or vocational school or University 
##                                               5
mydata <- ordinal_recode (variable="e_hhheadeduc", break_points=break_edu, missing=999999, value_labels=labels_edu)

## [1] "Frequency table before encoding"
## e_hhheadeduc. Hh Head Level Of Education
##                           1. None                     2. Don't know                 3. Primary school 
##                                46                                 8                               300 
##               4. Secondary school 5. Technical or vocational school                     6. University 
##                                36                                 1                                 8 
##                              <NA> 
##                               552 
##    recoded
##     [1,2) [2,3) [3,4) [4,5) [5,7) [7,777) [777,888) [888,1e+06)
##   1    46     0     0     0     0       0         0           0
##   2     0     8     0     0     0       0         0           0
##   3     0     0   300     0     0       0         0           0
##   4     0     0     0    36     0       0         0           0
##   5     0     0     0     0     1       0         0           0
##   6     0     0     0     0     8       0         0           0
## [1] "Frequency table after encoding"
## e_hhheadeduc. Hh Head Level Of Education
##                                         1. None                                   2. Don't know 
##                                              46                                               8 
##                               3. Primary school                             4. Secondary school 
##                                             300                                              36 
## 5. Technical or vocational school or University                                            <NA> 
##                                               9                                             552 
## [1] "Inspect value labels and relabel as necessary"
##                                         1. None                                   2. Don't know 
##                                               1                                               2 
##                               3. Primary school                             4. Secondary school 
##                                               3                                               4 
## 5. Technical or vocational school or University 
##                                               5
break_edu <- c(0,1,2,3,5)
labels_edu <- c("0. None" =1, 
                "1. Completed primary" =2, 
                "2. Completed secondary" =3, 
                "3. Completed Technical or University" =4, 
                "4. Don't Know" =5)
mydata <- ordinal_recode (variable="e_p8", break_points=break_edu, missing=999999, value_labels=labels_edu)

## [1] "Frequency table before encoding"
## e_p8. What Is The Highest Level Of Schooling Completed Of The Head Of The Household?
##                 0. None    1. Completed primary  2. Completed secondary  3. Completed technical 
##                      46                     300                      36                       1 
## 4. Completed university           5. Don_t know                    <NA> 
##                       8                       8                     552 
##    recoded
##     [0,1) [1,2) [2,3) [3,5) [5,1e+06)
##   0    46     0     0     0         0
##   1     0   300     0     0         0
##   2     0     0    36     0         0
##   3     0     0     0     1         0
##   4     0     0     0     8         0
##   5     0     0     0     0         8
## [1] "Frequency table after encoding"
## e_p8. What Is The Highest Level Of Schooling Completed Of The Head Of The Household?
##                              0. None                 1. Completed primary               2. Completed secondary 
##                                   46                                  300                                   36 
## 3. Completed Technical or University                        4. Don't Know                                 <NA> 
##                                    9                                    8                                  552 
## [1] "Inspect value labels and relabel as necessary"
##                              0. None                 1. Completed primary               2. Completed secondary 
##                                    1                                    2                                    3 
## 3. Completed Technical or University                        4. Don't Know 
##                                    4                                    5
# Top code household composition variables with large and unusual numbers 

mydata <- top_recode ("hhsize", break_point=10, missing=999999) # Topcode cases with 10 or more individuals. 
## [1] "Frequency table before encoding"
## hhsize. Number Of Individuals In Household
##    1    2    3    4    5    6    7    8    9   10   11   12   13   15   21 <NA> 
##    3   57  209  259  182  115   60   19   16    9    5    4    1    1    1   10

## [1] "Frequency table after encoding"
## hhsize. Number Of Individuals In Household
##          1          2          3          4          5          6          7          8          9 10 or more 
##          3         57        209        259        182        115         60         19         16         21 
##       <NA> 
##         10

mydata <- top_recode ("b_hhsize", break_point=10, missing=999999) # Topcode cases with 10 or more individuals. 
## [1] "Frequency table before encoding"
## b_hhsize. Hh Size
##    2    3    4    5    6    7    8    9   10   11 <NA> 
##   31  121  151  116   69   34    8   13    6    1  401

## [1] "Frequency table after encoding"
## b_hhsize. Hh Size
##          2          3          4          5          6          7          8          9 10 or more       <NA> 
##         31        121        151        116         69         34          8         13          7        401

mydata <- top_recode ("e_hhsize", break_point=10, missing=999999) # Topcode cases with 10 or more individuals. 
## [1] "Frequency table before encoding"
## e_hhsize. Number Of Individuals In Household
##    1    2    3    4    5    6    7    8    9   10   11   12   13   15   21 <NA> 
##    3   27   89  110   67   47   26   11    4    4    4    4    1    1    1  552

## [1] "Frequency table after encoding"
## e_hhsize. Number Of Individuals In Household
##          1          2          3          4          5          6          7          8          9 10 or more 
##          3         27         89        110         67         47         26         11          4         15 
##       <NA> 
##        552

mydata <- top_recode ("b_q6", break_point=10, missing=999999) # Topcode cases with 10 or more individuals. 
## [1] "Frequency table before encoding"
## b_q6. How Many Brothers And Sisters Do You Have?
##    0    1    2    3    4    5    6    7    8    9   10   11   14 <NA> 
##   14  119  160   96   65   36   27   14    5    5    7    2    1  400

## [1] "Frequency table after encoding"
## b_q6. How Many Brothers And Sisters Do You Have?
##          0          1          2          3          4          5          6          7          8          9 
##         14        119        160         96         65         36         27         14          5          5 
## 10 or more       <NA> 
##         10        400

mydata <- top_recode ("b_q8", break_point=10, missing=999999) # Topcode cases with 10 or more individuals. 
## [1] "Frequency table before encoding"
## b_q8. How Many People, Including Yourself, Live In Your Household?
##    2    3    4    5    6    7    8    9   10   11 <NA> 
##   31  121  151  116   69   34    8   13    6    1  401

## [1] "Frequency table after encoding"
## b_q8. How Many People, Including Yourself, Live In Your Household?
##          2          3          4          5          6          7          8          9 10 or more       <NA> 
##         31        121        151        116         69         34          8         13          7        401

mydata <- top_recode ("e_p4", break_point=10, missing=999999) # Topcode cases with 10 or more individuals. 
## [1] "Frequency table before encoding"
## e_p4. How Many Brothers And Sisters Do You Have?
##    0    1    2    3    4    5    6    7    8    9   10   11   15 <NA> 
##    6   88  118   54   57   25   11   14    6    9    5    4    1  553

## [1] "Frequency table after encoding"
## e_p4. How Many Brothers And Sisters Do You Have?
##          0          1          2          3          4          5          6          7          8          9 
##          6         88        118         54         57         25         11         14          6          9 
## 10 or more       <NA> 
##         10        553

mydata <- top_recode ("e_p6", break_point=10, missing=999999) # Topcode cases with 10 or more individuals. 
## [1] "Frequency table before encoding"
## e_p6. How Many People, Including Yourself, Live In Your Household?
##    1    2    3    4    5    6    7    8    9   10   11   12   13   15   21 <NA> 
##    3   27   89  110   67   47   26   11    4    4    4    4    1    1    1  552

## [1] "Frequency table after encoding"
## e_p6. How Many People, Including Yourself, Live In Your Household?
##          1          2          3          4          5          6          7          8          9 10 or more 
##          3         27         89        110         67         47         26         11          4         15 
##       <NA> 
##        552

# Dates of birth removed, as strong identifier and ages already provided in separate variables

mydata <- mydata[!names(mydata) %in% c("b_q3a", "b_q3b", "b_q3c")]

# Combine technical and university education of head of household 

b_headtech_univ <- ifelse(mydata$b_headtech==1, 1,
                          ifelse(mydata$b_headuniv==1, 1,0))
mydata <- add_column(mydata, b_headtech_univ, .after = "b_headsec")

e_headtech_univ <- ifelse(mydata$e_headtech==1, 1,
                          ifelse(mydata$e_headuniv==1, 1,0))
mydata <- add_column(mydata, e_headtech_univ, .after = "e_headsec")

mydata <- mydata[!names(mydata) %in% c("b_headtech", "b_headuniv", "e_headtech", "e_headuniv")]

mydata$b_q10[mydata$b_q10=="Tecnico"] <- "Tecnico/Universidad"
mydata$b_q10[mydata$b_q10=="Universidad"] <- "Tecnico/Universidad"

# Encode school names

mydata <- encode_location (variables= c("b_q23", "e_p23ot"), missing=NA)
## [1] "Frequency table before encoding"
## b_q23. What Was The Name Of The Last School You Attended?
##                                                         Colegio de Cedros          Colegio Enrique Guier Saenz 
##                                  926                                    1                                    1 
##          Colegio Marco Tulio Salazar Colegio Nocturno Primario de Higuito                Colegio Rincon Grande 
##                                    1                                    1                                    2 
##       Colegio Rincon Grande de Pavas                       Condes Lourdes        Ctp Jose Maria Zeledon Brenes 
##                                    1                                    1                                    1 
##           Liceo Academico Isla Chira    Liceo Barrio San Jose de Alajuela                   Liceo de Alajuelta 
##                                    1                                    1                                    1 
##                     Liceo de higuito                     Liceo de Higuito                  Liceo de San Miguel 
##                                    1                                    2                                    1 
##     Liceo de San Miguel,desamparados            Liceo Enrique Guier Saenz                     Liceo Isla Chira 
##                                    1                                    1                                    1 
##                  Liceo Isla de Chira             Liceo Leon Cortes Castro     Liceo Monsenor Ruben Odio Herera 
##                                    1                                    1                                    1 
##                     Liceo San Miguel                  Monsenor ruben odio               rincon grande de pavas 
##                                    1                                    1                                    1 
## [1] "Frequency table after encoding"
## b_q23. What Was The Name Of The Last School You Attended?
## 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   2   1   2   1   1 926   1   1   1 
## [1] "Frequency table before encoding"
## e_p23ot. Other Name Of Last School Attended
##                                                                     ctp pacayas 
##                                     940                                       1 
##                       Ctp San sebastian                       escuela de higito 
##                                       1                                       1 
##                       instituto corpesa                        liceo alajuelita 
##                                       1                                       1 
##         Liceo de  higuito  Desamparados liceo esperimental biling?e de palmar?s 
##                                       1                                       1 
##          liceo nocturno de desamparados                 liceo rural isla venado 
##                                       1                                       1 
##               liceo t?cnico isla venado          sindea. puerto viejo sarapiqui 
##                                       1                                       1 
## [1] "Frequency table after encoding"
## e_p23ot. Other Name Of Last School Attended
## 572 573 574 575 576 577 578 579 580 581 582 583 
##   1 940   1   1   1   1   1   1   1   1   1   1
# Recode "apartamento" into "otro". 

mydata$b_q45[mydata$b_q45=="Apartamento"] <- "Otro"

# !!!Include relevant variables in list below

indirect_PII <- c("hhhead", 
                  "hhheadgrand", 
                  "hhheadsib", 
                  "hhheadauntuncle", 
                  "hhheadnon", 
                  "hhheadself", 
                  "attend", 
                  "attend90", 
                  "q36c", 
                  "q36f", 
                  "b_schdrop_bl", 
                  "b_q7", 
                  "b_q9", 
                  "b_q10", 
                  "b_q12", 
                  "b_q20", 
                  "b_q23", 
                  "b_q30", 
                  "b_q34", 
                  "b_q39", 
                  "b_q45", 
                  "b_q46", 
                  "e_p23ot")

capture_tables (indirect_PII)

# Recode those with very specific values where more than half of the sample have actual data. 

mydata <- encode_direct_PII_team (variables="b_schdrop_bl")
## [1] "Frequency table before encoding"
## b_schdrop_bl. Name Of Last School Attended
##                                                         Colegio de Cedros          Colegio Enrique Guier Saenz 
##                                  926                                    1                                    1 
##          Colegio Marco Tulio Salazar Colegio Nocturno Primario de Higuito                Colegio Rincon Grande 
##                                    1                                    1                                    2 
##       Colegio Rincon Grande de Pavas                       Condes Lourdes        Ctp Jose Maria Zeledon Brenes 
##                                    1                                    1                                    1 
##           Liceo Academico Isla Chira    Liceo Barrio San Jose de Alajuela                   Liceo de Alajuelta 
##                                    1                                    1                                    1 
##                     Liceo de higuito                     Liceo de Higuito                  Liceo de San Miguel 
##                                    1                                    2                                    1 
##     Liceo de San Miguel,desamparados            Liceo Enrique Guier Saenz                     Liceo Isla Chira 
##                                    1                                    1                                    1 
##                  Liceo Isla de Chira             Liceo Leon Cortes Castro     Liceo Monsenor Ruben Odio Herera 
##                                    1                                    1                                    1 
##                     Liceo San Miguel                  Monsenor ruben odio               rincon grande de pavas 
##                                    1                                    1                                    1 
## [1] "Frequency table after encoding"
## b_schdrop_bl. Name Of Last School Attended
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24 
## 926   1   1   1   1   2   1   1   1   1   1   1   1   2   1   1   1   1   1   1   1   1   1   1

Matching and crosstabulations: Run automated PII check

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('male', 'hhheadeduc', 'ageyears') ##!!! Replace with candidate categorical demo vars

# weight variable
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 951 rows and 662 variables.
##   --> Categorical key variables: male, hhheadeduc, ageyears
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories     Mean size           Size of smallest (>0)      
##          male                    3 (3)   471.000 (471.000)                   415 (415)
##    hhheadeduc                    6 (6)   188.200 (188.200)                    10  (10)
##      ageyears                    8 (8)   134.571 (134.571)                    17  (17)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 7 (0.736%)
##   - 3-anonymity: 23 (2.419%)
##   - 5-anonymity: 43 (4.522%)
## 
## ----------------------------------------------------------------------

Show values of key variable of records that violate k-anonymity

#mydata <- labelDataset(mydata)
notAnon <- sdcInitial@risk$individual[,2] < 2 # for 2-anonymity
mydata[notAnon,selectedKeyVars]
## # A tibble: 7 x 3
##            male                                          hhheadeduc ageyears
##       <dbl+lbl>                                           <dbl+lbl>    <dbl>
## 1 1 [1. Male]   2 [2. Don't know]                                         13
## 2 1 [1. Male]   5 [5. Technical or vocational school or University]       13
## 3 0 [0. Female] 2 [2. Don't know]                                         18
## 4 0 [0. Female] 2 [2. Don't know]                                         16
## 5 1 [1. Male]   2 [2. Don't know]                                         15
## 6 1 [1. Male]   5 [5. Technical or vocational school or University]       15
## 7 1 [1. Male]   2 [2. Don't know]                                         18
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables

extractManipData(sdcFinal)[notAnon,selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
##     male hhheadeduc ageyears
## 108    1          2       NA
## 296    1          5       NA
## 323    0          2       NA
## 411    0         NA       16
## 466    1          2       NA
## 744    1          5       NA
## 788    1          2       NA
hheduc <- c("hhheadeduc",
           "b_hhheadeduc",
           "b_headnoeduc",
           "b_headdkeduc",
           "b_headprim",
           "b_headsec",
           "b_headtech",
           "b_headuniv",
           "e_hhheadeduc",
           "e_headnoeduc",
           "e_headdkeduc",
           "e_headprim",
           "e_headsec",
           "e_headtech",
           "e_headuniv")
mydata [notAnon, hheduc] <- NA
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 951 rows and 666 variables.
##   --> Categorical key variables: male, hhheadeduc, ageyears
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories     Mean size           Size of smallest (>0)      
##          male                    3 (3)   471.000 (471.000)                   415 (415)
##    hhheadeduc                    6 (6)   186.800 (186.800)                     5   (5)
##      ageyears                    8 (8)   134.571 (134.571)                    17  (17)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 0 (0.000%)
##   - 3-anonymity: 12 (1.262%)
##   - 5-anonymity: 28 (2.944%)
## 
## ----------------------------------------------------------------------

Open-ends: review responses for any sensitive information, redact as necessary

# !!! Identify open-end variables here: 
open_ends <- c("b_q30oth")
report_open (list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 

mydata <- mydata[!names(mydata) %in% "b_q30oth"] # Removed, verbatim response in Spanish with high reidentification risk

GPS data: Displace

# !!! No GPS

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)