rm(list=ls(all=t))

Setup filenames

filename <- "Survey Data - Ecuador_Public Use" # !!!Update filename
functions_vers <-  "functions_1.7.R" # !!!Update helper functions file

Setup data, functions and create dictionary for dataset review

source (functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 


#!!!Save flagged dictionary in .csv format, add "DatasetReview" to name and continue processing data with subset of flagged variables

Direct PII: variables to be removed

# !!!No direct PII

Direct PII-team: Encode field team names

# !!!No direct PII-team

Small locations: Encode locations with pop <100,000 using random large numbers

#  !!!No small locations

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Recode education attainment of adults to reduce risk of re-identification 
# !!!No education attainment of adults.

# Top code household composition variables with large and unusual numbers 

mydata <- top_recode ("b_v6", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals. 
## [1] "Frequency table before encoding"
## b_v6. How Many People, Including Yourself, Live In Your Household?
##    1    2    3    4    5    6    7    8    9   10   11   12   14   16   29 <NA> 
##    2   23   72  107  179  108   64   32   17   13    6    6    1    1    1   44

## [1] "Frequency table after encoding"
## b_v6. How Many People, Including Yourself, Live In Your Household?
##          1          2          3          4          5          6          7          8 
##          2         23         72        107        179        108         64         32 
##          9 10 or more       <NA> 
##         17         28         44

mydata <- top_recode ("b_hhsize_2016_masked", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## b_hhsize_2016_masked. Household Size
## 2 or less         3         4         5         6         7         8         9        10 
##        25        72       107       179       108        64        32        17        13 
##       11+      <NA> 
##        15        44

## [1] "Frequency table after encoding"
## b_hhsize_2016_masked. Household Size
##  2 or less          3          4          5          6          7          8          9 
##         25         72        107        179        108         64         32         17 
## 10 or more       <NA> 
##         28         44

mydata <- top_recode ("b_hhsize_2016_imp", break_point=10, missing=c(888, 999999)) # Topcode cases with 10 or more individuals.
## [1] "Frequency table before encoding"
## b_hhsize_2016_imp. Imputed Version Of B_Hhsize_2016
##                1                2                3                4                5 
##                2               23               72              107              179 
## 5.40981006622314                6                7                8                9 
##               44              108               64               32               17 
##               10               11               12               14               16 
##               13                6                6                1                1 
##               29 
##                1

## [1] "Frequency table after encoding"
## b_hhsize_2016_imp. Imputed Version Of B_Hhsize_2016
##                1                2                3                4                5 
##                2               23               72              107              179 
## 5.40981006622314                6                7                8                9 
##               44              108               64               32               17 
##       10 or more 
##               28

# Top code high income to the 99.5 percentile
# No income variables to top code

Indirect PII - Categrical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

indirect_PII <- c("age_at_base",
                  "female",
                  "b_v2",
                  "b_v3_masked",
                  "b_v4",
                  "b_v5",
                  "b_v7",
                  "b_v8",
                  "b_v9",
                  "b_v10",
                  "b_v11",
                  "b_v12",
                  "b_v13",
                  "b_v20_1",
                  "b_v20_2",
                  "b_v20_3",
                  "b_v20_4",
                  "b_v20_5",
                  "b_v20_6",
                  "b_v20_7",
                  "b_v20_8",
                  "b_v20x1_masked",
                  "b_v21_1_masked",
                  "b_v21_2_masked",
                  "b_v21_3_masked",
                  "b_v21_4_masked",
                  "b_v21_5_masked",
                  "b_v21_6_masked",
                  "b_v21_7_masked",
                  "b_v22_1",
                  "b_v22_2",
                  "b_v22_3",
                  "b_v22_4",
                  "b_v23",
                  "b_v24",
                  "b_v25_1",
                  "b_v25_2",
                  "b_v25_3",
                  "b_v25_4",
                  "b_v25_5",
                  "b_v25_6",
                  "b_v25_7",
                  "b_v25_8",
                  "b_v25_9",
                  "b_v25_10",
                  "b_v26_1_masked",
                  "b_v26_2_masked",
                  "b_v26_3_masked",
                  "b_v26_4_masked",
                  "b_v26_5_masked",
                  "b_v26_6_masked",
                  "b_v26_7_masked",
                  "b_v27_1",
                  "b_v27_2",
                  "b_v27_3",
                  "b_v27_4",
                  "b_v28_masked",
                  "b_v29_1",
                  "b_v29_2",
                  "b_v29_3",
                  "b_v29_4",
                  "b_v29_5",
                  "b_v29_6",
                  "b_v29_7",
                  "b_v29_8",
                  "b_v29_9",
                  "b_v29_10",
                  "b_v29_11",
                  "b_v29_13",
                  "b_v29_18",
                  "b_v29_19",
                  "b_v29_masked",
                  "b_v29_other",
                  "b_v30_1",
                  "b_v30_2",
                  "b_v30_3",
                  "b_v30_4",
                  "b_v30_5",
                  "b_v30_6",
                  "b_v30_7",
                  "b_v30_8",
                  "b_v30_9",
                  "b_v30_10",
                  "b_v30_11",
                  "b_v30_other",
                  "b_v31",
                  "b_v32",
                  "b_v32_other",
                  "b_v33",
                  "b_v34",
                  "b_v35_1",
                  "b_v35_2",
                  "b_v35_3",
                  "b_v35_4",
                  "b_v35_5",
                  "b_v35_6",
                  "b_v35_7",
                  "b_v35_8",
                  "b_v35_9",
                  "b_v35_10",
                  "b_v35_11",
                  "b_v35_12",
                  "b_v35_13",
                  "b_v35_14",
                  "b_v35x0",
                  "b_v36_1",
                  "b_v36_2",
                  "b_v36_3",
                  "b_v36_4",
                  "b_v36_5",
                  "b_v36_6",
                  "b_v36_7",
                  "b_v36_other",
                  "b_v37_1",
                  "b_v37_2",
                  "b_v37_3",
                  "b_v37_4",
                  "b_v37_5",
                  "b_v37_6",
                  "b_v37_7",
                  "b_v37_8",
                  "b_v37_9",
                  "b_v37_10",
                  "b_v37_11",
                  "b_v37_12",
                  "b_v37_13",
                  "b_v37_14",
                  "b_v37_15",
                  "b_v37_other",
                  "b_v39",
                  "b_v40",
                  "b_v46_masked",
                  "b_v47_masked",
                  "e_v7_a",
                  "e_v7_b",
                  "e_v7_c",
                  "e_v7_d",
                  "e_v7_e",
                  "e_v7_f",
                  "e_v7_g",
                  "e_v7_h",
                  "e_v7_other_masked",
                  "e_v8_1_masked",
                  "e_v8_2_masked",
                  "e_v8_3_masked",
                  "e_v8_4_masked",
                  "e_v8_5_masked",
                  "e_v8_6_masked",
                  "e_v8_7_masked",
                  "e_v9_a",
                  "e_v9_b",
                  "e_v9_c",
                  "e_v9_d",
                  "e_v10",
                  "e_v11_a",
                  "e_v11_b",
                  "e_v11_c",
                  "e_v11_d",
                  "e_v11_e",
                  "e_v11_f",
                  "e_v11_g",
                  "e_v11_h",
                  "e_v11_i",
                  "e_v11_j",
                  "e_v12_1_masked",
                  "e_v12_2_masked",
                  "e_v12_3_masked",
                  "e_v12_4_masked",
                  "e_v12_5_masked",
                  "e_v12_6_masked",
                  "e_v12_7_masked",
                  "e_v13_a",
                  "e_v13_b",
                  "e_v13_c",
                  "e_v13_d",
                  "e_v14_masked",
                  "e_v15_2",
                  "e_v15_3",
                  "e_v15_4",
                  "e_v15_5",
                  "e_v15_6",
                  "e_v15_7",
                  "e_v15_8",
                  "e_v15_9",
                  "e_v15_10",
                  "e_v15_11",
                  "e_v15_12",
                  "e_v15_13",
                  "e_v15_14",
                  "e_v15_15",
                  "e_v15_16",
                  "e_v15_masked",
                  "e_v15_other",
                  "e_v15_other_hazardous",
                  "e_v16_1",
                  "e_v16_2",
                  "e_v16_3",
                  "e_v16_4",
                  "e_v16_5",
                  "e_v16_6",
                  "e_v16_7",
                  "e_v16_8",
                  "e_v16_9",
                  "e_v16_10",
                  "e_v16_11",
                  "e_v16_other",
                  "e_v17",
                  "e_v18",
                  "e_v19",
                  "e_v20",
                  "e_v21_6meses",
                  "e_v21_a",
                  "e_v21_b",
                  "e_v21_c",
                  "e_v21_d",
                  "e_v21_e",
                  "e_v21_f",
                  "e_v21_g",
                  "e_v21_h",
                  "e_v21_i",
                  "e_v21_j",
                  "e_v21_k",
                  "e_v21_l",
                  "e_v21_m",
                  "e_v21_n",
                  "e_v22_a",
                  "e_v22_b",
                  "e_v22_c",
                  "e_v22_d",
                  "e_v22_e",
                  "e_v22_f",
                  "e_v22_g",
                  "e_v22_other",
                  "e_v23_a",
                  "e_v23_b",
                  "e_v23_c",
                  "e_v23_d",
                  "e_v23_e",
                  "e_v23_f",
                  "e_v23_g",
                  "e_v23_h",
                  "e_v23_i",
                  "e_v23_j",
                  "e_v23_k",
                  "e_v23_l",
                  "e_v23_m",
                  "e_v23_o_masked",
                  "e_v23_other",
                  "e_v25",
                  "e_v26",
                  "e_v31_masked",
                  "e_v32_masked",
                  "b_afro_2016",
                  "b_any_dustfumes_2016",
                  "b_any_harrassm_2016",
                  "b_any_injury_2016",
                  "b_casual2_2016",
                  "b_casual_2016",
                  "b_chorehrs_2016_masked",
                  "b_vulnerable2_2016",
                  "b_vulnerable_2016",
                  "b_fnat_2016",
                  "b_fspan_2016",
                  "b_gangcurr_2016",
                  "b_gangpast_2016",
                  "b_hcl_2016",
                  "b_hhchores_2016",
                  "b_hz_industry_2016",
                  "b_hz_occupation_2016",
                  "b_indig_2016",
                  "b_irregular_empl2_2016",
                  "b_irregular_empl_2016",
                  "b_long_hrs_2016",
                  "b_married_2016",
                  "b_mestizo_2016",
                  "b_mnat_2016",
                  "b_montub_2016",
                  "b_mspan_2016",
                  "b_night_work_2016",
                  "b_nightchores_2016",
                  "b_older_2016",
                  "b_othrace_2016",
                  "b_ownchild_2016",
                  "b_sameage_2016",
                  "b_temporary_noncasual2_2016",
                  "b_temporary_noncasual_2016",
                  "b_total_hrs_2016_masked",
                  "b_white_2016",
                  "b_working_2016",
                  "b_yearsout_2016",
                  "b_younger_2016",
                  "b_afro_2016_imp",
                  "b_any_dustfumes_2016_imp",
                  "b_any_harrassm_2016_imp",
                  "b_any_injury_2016_imp",
                  "b_chorehrs_2016_imp",
                  "b_fspan_2016_imp",
                  "b_gangcurr_2016_imp",
                  "b_hcl_2016_imp",
                  "b_hz_industry_2016_imp",
                  "b_hz_occupation_2016_imp",
                  "b_irregular_empl2_2016_imp",
                  "b_irregular_empl_2016_imp",
                  "b_long_hrs_2016_imp",
                  "b_married_2016_imp",
                  "b_mestizo_2016_imp",
                  "b_mspan_2016_imp",
                  "b_night_work_2016_imp",
                  "b_nightchores_2016_imp",
                  "b_older_2016_imp",
                  "b_othrace_2016_imp",
                  "b_ownchild_2016_imp",
                  "b_sameage_2016_imp",
                  "b_total_hrs_2016_imp",
                  "b_white_2016_imp",
                  "b_working_2016_imp",
                  "b_yearsout_2016_imp",
                  "b_younger_2016_imp",
                  "e_any_dustfumes_2017",
                  "e_any_harrassm_2017",
                  "e_any_injury_2017",
                  "e_casual_2017",
                  "e_chorehrs_2017_masked",
                  "e_gangcurr_2017",
                  "e_gangpast_2017",
                  "e_hcl_2017",
                  "e_hhchores_2017",
                  "e_hz_industry_2017",
                  "e_hz_occupation_2017",
                  "e_irregular_empl2_2017",
                  "e_irregular_empl_2017",
                  "e_long_hrs_2017",
                  "e_night_work_2017",
                  "e_nightchores_2017",
                  "e_shortcontract_2017",
                  "e_vulnerable_2017",
                  "e_temporary_noncasual_2017",
                  "e_total_hrs_2017_masked",
                  "e_working_2017")

capture_tables (indirect_PII)

# Recode those with very specific values. 

mydata <- mydata[!names(mydata) %in% c("b_v31", "e_v17")] # Too detailed, open-ended verbatim in Spanish

# Recode races or ethnicity to reduce risk of re-identification

break_edu <- c(1,2,3,88,99)
labels_edu <- c("1. White" =1,
                "2. Mestizo" = 2,
                "3. Other" =3)
mydata <- ordinal_recode (variable="b_v9", break_points=break_edu, missing=8888, value_labels=labels_edu)

## [1] "Frequency table before encoding"
## b_v9. What Is Your Race/Ethnicity?
##                   White                 Mestizo Afro, black, or mulatto 
##                      44                     536                      30 
##                Montubio              Indigenous                      88 
##                       7                      18                       2 
##                      99                    <NA> 
##                       1                      38 
##     recoded
##      [1,2) [2,3) [3,88) [88,99) [99,8.89e+03)
##   1     44     0      0       0             0
##   2      0   536      0       0             0
##   3      0     0     30       0             0
##   4      0     0      7       0             0
##   5      0     0     18       0             0
##   88     0     0      0       2             0
##   99     0     0      0       0             1
## [1] "Frequency table after encoding"
## b_v9. What Is Your Race/Ethnicity?
##   1. White 2. Mestizo   3. Other          4          5       <NA> 
##         44        536         55          2          1         38 
## [1] "Inspect value labels and relabel as necessary"
##   1. White 2. Mestizo   3. Other 
##          1          2          3
#Drop variables that shows the same information as b_v9.

dropvars <- c("b_afro_2016","b_indig_2016","b_montub_2016","b_othrace_2016", "b_afro_2016_imp","b_othrace_2016_imp") 
mydata <- mydata[!names(mydata) %in% dropvars]

#Recode mother and father's native language to reduce risk of re-identification

break_edu <- c(1,2,4,5)
labels_edu <- c("1. Spanish" =1,
                "2. Other" = 2,
                "3. Not sure"=3,
                "4. No response"=4)
mydata <- ordinal_recode (variable="b_v7", break_points=break_edu, missing=8888, value_labels=labels_edu)

## [1] "Frequency table before encoding"
## b_v7. What Was Your Mother'S Mother Tongue?
##          Spanish  Native language Another language         Not sure      No response 
##              569               25                2               40                2 
##             <NA> 
##               38 
##     recoded
##      [1,2) [2,4) [4,5) [5,8.89e+03)
##   1    569     0     0            0
##   2      0    25     0            0
##   3      0     2     0            0
##   4      0     0    40            0
##   99     0     0     0            2
## [1] "Frequency table after encoding"
## b_v7. What Was Your Mother'S Mother Tongue?
##     1. Spanish       2. Other    3. Not sure 4. No response           <NA> 
##            569             27             40              2             38 
## [1] "Inspect value labels and relabel as necessary"
##     1. Spanish       2. Other    3. Not sure 4. No response 
##              1              2              3              4
mydata <- ordinal_recode (variable="b_v8", break_points=break_edu, missing=8888, value_labels=labels_edu)

## [1] "Frequency table before encoding"
## b_v8. What Was Your Father'S Mother Tongue?
##          Spanish  Native language Another language         Not sure      No response 
##              539               23                3               71                2 
##             <NA> 
##               38 
##     recoded
##      [1,2) [2,4) [4,5) [5,8.89e+03)
##   1    539     0     0            0
##   2      0    23     0            0
##   3      0     3     0            0
##   4      0     0    71            0
##   99     0     0     0            2
## [1] "Frequency table after encoding"
## b_v8. What Was Your Father'S Mother Tongue?
##     1. Spanish       2. Other    3. Not sure 4. No response           <NA> 
##            539             26             71              2             38 
## [1] "Inspect value labels and relabel as necessary"
##     1. Spanish       2. Other    3. Not sure 4. No response 
##              1              2              3              4

Matching and crosstabulations: Run automated PII check

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('age_at_base', 'female', 'b_v9') ##!!! Replace with candidate categorical demo vars

# weight variable (add if available)
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 676 rows and 558 variables.
##   --> Categorical key variables: age_at_base, female, b_v9
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories     Mean size           Size of smallest (>0)      
##   age_at_base                    4 (4)   169.000 (169.000)                    40  (40)
##        female                    2 (2)   338.000 (338.000)                   260 (260)
##          b_v9                    6 (6)   127.600 (127.600)                     1   (1)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 0 (0.000%)
##   - 3-anonymity: 0 (0.000%)
##   - 5-anonymity: 2 (0.296%)
## 
## ----------------------------------------------------------------------

Open-ends: review responses for any sensitive information, redact as necessary

# !!! Identify open-end variables here: 

!!! No Open-ends

GPS data: Displace

# !!!No GPS data

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)