rm(list=ls(all=t))
filename <- "Costa Rica_Public Use" # !!!Update filename
functions_vers <- "functions_1.6.R" # !!!Update helper functions file
source (functions_vers)
#mydata <- mydata [1:10,] # remove '#' from #mydata if you want to conduct a fast check on 10 rows.
Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
#!!!Save flagged dictionary in .xlsx format, add "DatasetReview" to name and continue processing data with subset of flagged variables
# !!!No Direct PII
!!!Replace vector in "variables" field below with relevant variable names
# Encode Direct PII-team
mydata <- encode_direct_PII_team (variables="b_entrevistador")
## [1] "Frequency table before encoding"
## b_entrevistador. Interviewer/Enumerator
## Brenda Hernandez Diana Quiros Jimenez
## 400 67 87
## Estefania Bonilla Vega Luis Daniel Rosales Arias Meyling Berroteran Largaespada
## 87 57 96
## Tatiana Urena Picado Yahaira Cisneros Alvarado Yuliana Castro Herrera
## 89 47 21
## [1] "Frequency table after encoding"
## b_entrevistador. Interviewer/Enumerator
## 1 2 3 4 5 6 7 8 9
## 400 67 87 87 57 96 89 47 21
!!!Include relevant variables, but check their population size first to confirm they are <100,000
locvars <- c("districtnum")
mydata <- encode_location (variables= locvars, missing=999999)
## [1] "Frequency table before encoding"
## districtnum. District Number, Numeric
## Alajuelita Arancibia Aserr<ed> Barbacoas
## 7 6 2 1
## Bolivar Buena Vista Cachi Cahuita
## 1 1 5 6
## Calle Zamora Capellades Cedral Central
## 3 2 1 4
## Chira Chires Chomes Cure<f1>a
## 9 1 6 3
## Desamparados Dulce Nombre Florencia Fortuna
## 2 1 1 4
## Gravilias Guacimo Guanacaste Guatil
## 4 2 1 1
## Hatillo Hatillo Centro Higuito Hojancha
## 2 1 2 5
## Horquetas Huacas Huetar Atl<e1>ntico Huetar Norte
## 9 1 1 1
## Ipis Isla Chira Isla Venado Jim<e9>nez
## 1 5 2 1
## La Carpio La Cruz La Uni<f3>n La Virgen
## 3 2 7 1
## La cruz La granja La uni<f3>n Lepanto
## 11 1 2 23
## Le<f3>n XIII Liberia Los Cerros Los Guido
## 1 11 1 1
## Los guido Manzanillo Miramar Montes de Oro
## 1 5 1 3
## Morales Nicoya Nosarita Pacayas
## 1 3 1 1
## Palmichal Pavas Pe<f1>as blancas Primero
## 2 13 1 1
## Purral Quebradilla R<ed>o Azul San Carlos
## 7 2 1 1
## San Felipe San Francisco San Francisco de 2 R<ed>os San Gabriel
## 9 1 2 1
## San Ignacio San Isidro San Juan San Juan de Dios
## 5 1 2 4
## San Luis San Miguel San Pablo San Pedro
## 4 55 10 2
## San Pedro. Montes de Oca San Rafael San Sebastian San luis
## 1 7 1 1
## San pablo Santa Cecilia Sixaola Tablazo
## 1 14 15 1
## Tib<e1>s Tirrases Tucurrique Turrubares
## 1 12 19 5
## Ure<f1>a Villa Jaris <c1>ngeles <NA>
## 2 1 1 560
## [1] "Frequency table after encoding"
## districtnum. District Number, Numeric
## 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318
## 1 7 2 15 2 1 5 12 6 4 5 1 2 1 3 4 7 1 19 1 1 2 4
## 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
## 1 1 1 13 1 1 2 1 2 5 6 1 14 1 1 1 2 1 2 9 1 1 1
## 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
## 1 1 5 1 9 10 3 23 1 3 55 2 1 1 1 9 1 11 1 1 1 5 2
## 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 <NA>
## 3 1 2 4 1 1 1 1 5 2 1 6 1 1 7 11 2 4 2 2 3 7 560
# Focus on variables with a "Lowest Freq" of 10 or less.
break_edu <- c(1,2,3,4,5,7,777,888)
labels_edu <- c("1. None" =1,
"2. Don't know" =2,
"3. Primary school" =3,
"4. Secondary school" =4,
"5. Technical or vocational school or University" =5)
mydata <- ordinal_recode (variable="hhheadeduc", break_points=break_edu, missing=999999, value_labels=labels_edu)
## [1] "Frequency table before encoding"
## hhheadeduc. Hh Head Level Of Education
## 1. None 2. Don't know 3. Primary school
## 153 10 649
## 4. Secondary school 5. Technical or vocational school 6. University
## 110 6 13
## <NA>
## 10
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,7) [7,777) [777,888) [888,1e+06)
## 1 153 0 0 0 0 0 0 0
## 2 0 10 0 0 0 0 0 0
## 3 0 0 649 0 0 0 0 0
## 4 0 0 0 110 0 0 0 0
## 5 0 0 0 0 6 0 0 0
## 6 0 0 0 0 13 0 0 0
## [1] "Frequency table after encoding"
## hhheadeduc. Hh Head Level Of Education
## 1. None 2. Don't know
## 153 10
## 3. Primary school 4. Secondary school
## 649 110
## 5. Technical or vocational school or University <NA>
## 19 10
## [1] "Inspect value labels and relabel as necessary"
## 1. None 2. Don't know
## 1 2
## 3. Primary school 4. Secondary school
## 3 4
## 5. Technical or vocational school or University
## 5
mydata <- ordinal_recode (variable="e_hhheadeduc", break_points=break_edu, missing=999999, value_labels=labels_edu)
## [1] "Frequency table before encoding"
## e_hhheadeduc. Hh Head Level Of Education
## 1. None 2. Don't know 3. Primary school
## 46 8 300
## 4. Secondary school 5. Technical or vocational school 6. University
## 36 1 8
## <NA>
## 552
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,7) [7,777) [777,888) [888,1e+06)
## 1 46 0 0 0 0 0 0 0
## 2 0 8 0 0 0 0 0 0
## 3 0 0 300 0 0 0 0 0
## 4 0 0 0 36 0 0 0 0
## 5 0 0 0 0 1 0 0 0
## 6 0 0 0 0 8 0 0 0
## [1] "Frequency table after encoding"
## e_hhheadeduc. Hh Head Level Of Education
## 1. None 2. Don't know
## 46 8
## 3. Primary school 4. Secondary school
## 300 36
## 5. Technical or vocational school or University <NA>
## 9 552
## [1] "Inspect value labels and relabel as necessary"
## 1. None 2. Don't know
## 1 2
## 3. Primary school 4. Secondary school
## 3 4
## 5. Technical or vocational school or University
## 5
break_edu <- c(0,1,2,3,5)
labels_edu <- c("0. None" =1,
"1. Completed primary" =2,
"2. Completed secondary" =3,
"3. Completed Technical or University" =4,
"4. Don't Know" =5)
mydata <- ordinal_recode (variable="e_p8", break_points=break_edu, missing=999999, value_labels=labels_edu)
## [1] "Frequency table before encoding"
## e_p8. What Is The Highest Level Of Schooling Completed Of The Head Of The Household?
## 0. None 1. Completed primary 2. Completed secondary 3. Completed technical
## 46 300 36 1
## 4. Completed university 5. Don_t know <NA>
## 8 8 552
## recoded
## [0,1) [1,2) [2,3) [3,5) [5,1e+06)
## 0 46 0 0 0 0
## 1 0 300 0 0 0
## 2 0 0 36 0 0
## 3 0 0 0 1 0
## 4 0 0 0 8 0
## 5 0 0 0 0 8
## [1] "Frequency table after encoding"
## e_p8. What Is The Highest Level Of Schooling Completed Of The Head Of The Household?
## 0. None 1. Completed primary 2. Completed secondary
## 46 300 36
## 3. Completed Technical or University 4. Don't Know <NA>
## 9 8 552
## [1] "Inspect value labels and relabel as necessary"
## 0. None 1. Completed primary 2. Completed secondary
## 1 2 3
## 3. Completed Technical or University 4. Don't Know
## 4 5
# Top code household composition variables with large and unusual numbers
mydata <- top_recode ("hhsize", break_point=10, missing=999999) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## hhsize. Number Of Individuals In Household
## 1 2 3 4 5 6 7 8 9 10 11 12 13 15 21 <NA>
## 3 57 209 259 182 115 60 19 16 9 5 4 1 1 1 10
## [1] "Frequency table after encoding"
## hhsize. Number Of Individuals In Household
## 1 2 3 4 5 6 7 8 9 10 or more
## 3 57 209 259 182 115 60 19 16 21
## <NA>
## 10
mydata <- top_recode ("b_hhsize", break_point=10, missing=999999) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## b_hhsize. Hh Size
## 2 3 4 5 6 7 8 9 10 11 <NA>
## 31 121 151 116 69 34 8 13 6 1 401
## [1] "Frequency table after encoding"
## b_hhsize. Hh Size
## 2 3 4 5 6 7 8 9 10 or more <NA>
## 31 121 151 116 69 34 8 13 7 401
mydata <- top_recode ("e_hhsize", break_point=10, missing=999999) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## e_hhsize. Number Of Individuals In Household
## 1 2 3 4 5 6 7 8 9 10 11 12 13 15 21 <NA>
## 3 27 89 110 67 47 26 11 4 4 4 4 1 1 1 552
## [1] "Frequency table after encoding"
## e_hhsize. Number Of Individuals In Household
## 1 2 3 4 5 6 7 8 9 10 or more
## 3 27 89 110 67 47 26 11 4 15
## <NA>
## 552
mydata <- top_recode ("b_q6", break_point=10, missing=999999) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## b_q6. How Many Brothers And Sisters Do You Have?
## 0 1 2 3 4 5 6 7 8 9 10 11 14 <NA>
## 14 119 160 96 65 36 27 14 5 5 7 2 1 400
## [1] "Frequency table after encoding"
## b_q6. How Many Brothers And Sisters Do You Have?
## 0 1 2 3 4 5 6 7 8 9
## 14 119 160 96 65 36 27 14 5 5
## 10 or more <NA>
## 10 400
mydata <- top_recode ("b_q8", break_point=10, missing=999999) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## b_q8. How Many People, Including Yourself, Live In Your Household?
## 2 3 4 5 6 7 8 9 10 11 <NA>
## 31 121 151 116 69 34 8 13 6 1 401
## [1] "Frequency table after encoding"
## b_q8. How Many People, Including Yourself, Live In Your Household?
## 2 3 4 5 6 7 8 9 10 or more <NA>
## 31 121 151 116 69 34 8 13 7 401
mydata <- top_recode ("e_p4", break_point=10, missing=999999) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## e_p4. How Many Brothers And Sisters Do You Have?
## 0 1 2 3 4 5 6 7 8 9 10 11 15 <NA>
## 6 88 118 54 57 25 11 14 6 9 5 4 1 553
## [1] "Frequency table after encoding"
## e_p4. How Many Brothers And Sisters Do You Have?
## 0 1 2 3 4 5 6 7 8 9
## 6 88 118 54 57 25 11 14 6 9
## 10 or more <NA>
## 10 553
mydata <- top_recode ("e_p6", break_point=10, missing=999999) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## e_p6. How Many People, Including Yourself, Live In Your Household?
## 1 2 3 4 5 6 7 8 9 10 11 12 13 15 21 <NA>
## 3 27 89 110 67 47 26 11 4 4 4 4 1 1 1 552
## [1] "Frequency table after encoding"
## e_p6. How Many People, Including Yourself, Live In Your Household?
## 1 2 3 4 5 6 7 8 9 10 or more
## 3 27 89 110 67 47 26 11 4 15
## <NA>
## 552
# Dates of birth removed, as strong identifier and ages already provided in separate variables
mydata <- mydata[!names(mydata) %in% c("b_q3a", "b_q3b", "b_q3c")]
# Combine technical and university education of head of household
b_headtech_univ <- ifelse(mydata$b_headtech==1, 1,
ifelse(mydata$b_headuniv==1, 1,0))
mydata <- add_column(mydata, b_headtech_univ, .after = "b_headsec")
e_headtech_univ <- ifelse(mydata$e_headtech==1, 1,
ifelse(mydata$e_headuniv==1, 1,0))
mydata <- add_column(mydata, e_headtech_univ, .after = "e_headsec")
mydata <- mydata[!names(mydata) %in% c("b_headtech", "b_headuniv", "e_headtech", "e_headuniv")]
mydata$b_q10[mydata$b_q10=="Tecnico"] <- "Tecnico/Universidad"
mydata$b_q10[mydata$b_q10=="Universidad"] <- "Tecnico/Universidad"
# Encode school names
mydata <- encode_location (variables= c("b_q23", "e_p23ot"), missing=NA)
## [1] "Frequency table before encoding"
## b_q23. What Was The Name Of The Last School You Attended?
## Colegio de Cedros Colegio Enrique Guier Saenz
## 926 1 1
## Colegio Marco Tulio Salazar Colegio Nocturno Primario de Higuito Colegio Rincon Grande
## 1 1 2
## Colegio Rincon Grande de Pavas Condes Lourdes Ctp Jose Maria Zeledon Brenes
## 1 1 1
## Liceo Academico Isla Chira Liceo Barrio San Jose de Alajuela Liceo de Alajuelta
## 1 1 1
## Liceo de higuito Liceo de Higuito Liceo de San Miguel
## 1 2 1
## Liceo de San Miguel,desamparados Liceo Enrique Guier Saenz Liceo Isla Chira
## 1 1 1
## Liceo Isla de Chira Liceo Leon Cortes Castro Liceo Monsenor Ruben Odio Herera
## 1 1 1
## Liceo San Miguel Monsenor ruben odio rincon grande de pavas
## 1 1 1
## [1] "Frequency table after encoding"
## b_q23. What Was The Name Of The Last School You Attended?
## 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 926 1 1 1
## [1] "Frequency table before encoding"
## e_p23ot. Other Name Of Last School Attended
## ctp pacayas
## 940 1
## Ctp San sebastian escuela de higito
## 1 1
## instituto corpesa liceo alajuelita
## 1 1
## Liceo de higuito Desamparados liceo esperimental biling?e de palmar?s
## 1 1
## liceo nocturno de desamparados liceo rural isla venado
## 1 1
## liceo t?cnico isla venado sindea. puerto viejo sarapiqui
## 1 1
## [1] "Frequency table after encoding"
## e_p23ot. Other Name Of Last School Attended
## 572 573 574 575 576 577 578 579 580 581 582 583
## 1 940 1 1 1 1 1 1 1 1 1 1
# Recode "apartamento" into "otro".
mydata$b_q45[mydata$b_q45=="Apartamento"] <- "Otro"
# !!!Include relevant variables in list below
indirect_PII <- c("hhhead",
"hhheadgrand",
"hhheadsib",
"hhheadauntuncle",
"hhheadnon",
"hhheadself",
"attend",
"attend90",
"q36c",
"q36f",
"b_schdrop_bl",
"b_q7",
"b_q9",
"b_q10",
"b_q12",
"b_q20",
"b_q23",
"b_q30",
"b_q34",
"b_q39",
"b_q45",
"b_q46",
"e_p23ot")
capture_tables (indirect_PII)
# Recode those with very specific values where more than half of the sample have actual data.
mydata <- encode_direct_PII_team (variables="b_schdrop_bl")
## [1] "Frequency table before encoding"
## b_schdrop_bl. Name Of Last School Attended
## Colegio de Cedros Colegio Enrique Guier Saenz
## 926 1 1
## Colegio Marco Tulio Salazar Colegio Nocturno Primario de Higuito Colegio Rincon Grande
## 1 1 2
## Colegio Rincon Grande de Pavas Condes Lourdes Ctp Jose Maria Zeledon Brenes
## 1 1 1
## Liceo Academico Isla Chira Liceo Barrio San Jose de Alajuela Liceo de Alajuelta
## 1 1 1
## Liceo de higuito Liceo de Higuito Liceo de San Miguel
## 1 2 1
## Liceo de San Miguel,desamparados Liceo Enrique Guier Saenz Liceo Isla Chira
## 1 1 1
## Liceo Isla de Chira Liceo Leon Cortes Castro Liceo Monsenor Ruben Odio Herera
## 1 1 1
## Liceo San Miguel Monsenor ruben odio rincon grande de pavas
## 1 1 1
## [1] "Frequency table after encoding"
## b_schdrop_bl. Name Of Last School Attended
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 926 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('male', 'hhheadeduc', 'ageyears') ##!!! Replace with candidate categorical demo vars
# weight variable
# selectedWeightVar = c('projwt') ##!!! Replace with weight var
# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id
# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 951 rows and 662 variables.
## --> Categorical key variables: male, hhheadeduc, ageyears
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## male 3 (3) 471.000 (471.000) 415 (415)
## hhheadeduc 6 (6) 188.200 (188.200) 10 (10)
## ageyears 8 (8) 134.571 (134.571) 17 (17)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 7 (0.736%)
## - 3-anonymity: 23 (2.419%)
## - 5-anonymity: 43 (4.522%)
##
## ----------------------------------------------------------------------
Show values of key variable of records that violate k-anonymity
#mydata <- labelDataset(mydata)
notAnon <- sdcInitial@risk$individual[,2] < 2 # for 2-anonymity
mydata[notAnon,selectedKeyVars]
## # A tibble: 7 x 3
## male hhheadeduc ageyears
## <dbl+lbl> <dbl+lbl> <dbl>
## 1 1 [1. Male] 2 [2. Don't know] 13
## 2 1 [1. Male] 5 [5. Technical or vocational school or University] 13
## 3 0 [0. Female] 2 [2. Don't know] 18
## 4 0 [0. Female] 2 [2. Don't know] 16
## 5 1 [1. Male] 2 [2. Don't know] 15
## 6 1 [1. Male] 5 [5. Technical or vocational school or University] 15
## 7 1 [1. Male] 2 [2. Don't know] 18
sdcFinal <- localSuppression(sdcInitial)
# Recombining anonymized variables
extractManipData(sdcFinal)[notAnon,selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## male hhheadeduc ageyears
## 108 1 2 NA
## 296 1 5 NA
## 323 0 2 NA
## 411 0 NA 16
## 466 1 2 NA
## 744 1 5 NA
## 788 1 2 NA
hheduc <- c("hhheadeduc",
"b_hhheadeduc",
"b_headnoeduc",
"b_headdkeduc",
"b_headprim",
"b_headsec",
"b_headtech",
"b_headuniv",
"e_hhheadeduc",
"e_headnoeduc",
"e_headdkeduc",
"e_headprim",
"e_headsec",
"e_headtech",
"e_headuniv")
mydata [notAnon, hheduc] <- NA
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 951 rows and 666 variables.
## --> Categorical key variables: male, hhheadeduc, ageyears
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## male 3 (3) 471.000 (471.000) 415 (415)
## hhheadeduc 6 (6) 186.800 (186.800) 5 (5)
## ageyears 8 (8) 134.571 (134.571) 17 (17)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 0 (0.000%)
## - 3-anonymity: 12 (1.262%)
## - 5-anonymity: 28 (2.944%)
##
## ----------------------------------------------------------------------
# !!! Identify open-end variables here:
open_ends <- c("b_q30oth")
report_open (list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
mydata <- mydata[!names(mydata) %in% "b_q30oth"] # Removed, verbatim response in Spanish with high reidentification risk
# !!! No GPS
Adds "_PU" (Public Use) to the end of the name
haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))
# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)