# Start from a clean workspace. `all.names = TRUE` also removes hidden
# (dot-prefixed) objects. The previous call passed `t` -- base R's transpose
# function -- where the logical TRUE was intended, and relied on partial
# matching of `all` to `all.names`.
rm(list = ls(all.names = TRUE))

Set up filenames, data, and functions, and create a dictionary for dataset review

# Base name of the dataset under review; reused for the output file names
# and the report title.  !!!Update filename
filename <- "Malawi_Child_Public Use"

# Helper-functions script to load.  !!!Update helper functions file
functions_vers <- "functions_1.7.R"
source(functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!!No Direct PII

Direct PII-team: Encode field team names

# !!!No Direct PII-team

Small locations: Encode locations with pop <100,000 using random large numbers

#  !!!Include relevant variables, but check their population size first to confirm they are <100,000

# Location variables to be replaced with numeric codes.
locvars <- c("community", "b_community", "e_community", "b_ta", "b_comm")

# Encode every listed variable with the helper; 999999 is supplied as the
# `missing` argument.
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## community. Community
##                    Chaola  Chazim'bobo     Chikho 2     Chinyata      Choumba     Kakoloha 
##        10417          363          313         1146          657          627          288 
##      Kanongo       Luwira       Mafuta     Mkombezi       Mlambe      Mzokoto     Nanzomba 
##          842          237          345          552          496          625          584 
##       Ndaula     Nyongani      Pondani Tamanimwendo    Waliranji 
##          729           73          596          552          811 
## [1] "Frequency table after encoding"
## community. Community
##   841   842   843   844   845   846   847   848   849   850   851   852   853   854   855   856 
##    73   657   288 10417   552   237   496   842   552   363   596   345  1146   627   625   729 
##   857   858   859 
##   811   313   584 
## [1] "Frequency table before encoding"
## b_community. b_Community
##                    Chaola  Chazim'bobo     Chikho 2     Chinyata      Choumba     Kakoloha 
##        18844           65           64          161           73           88           40 
##      Kanongo       Luwira       Mafuta     Mkombezi       Mlambe      Mzokoto     Nanzomba 
##           92           52           55           76           72           76           79 
##       Ndaula     Nyongani      Pondani Tamanimwendo    Waliranji 
##          101            4          115           68          128 
## [1] "Frequency table after encoding"
## b_community. b_Community
##  1001  1002  1003  1004  1005  1006  1007  1008  1009  1010  1011  1012  1013  1014  1015  1016 
##    92     4 18844    55    40   128    76    65    79    52    64    68    72   161    76    88 
##  1017  1018  1019 
##   101   115    73 
## [1] "Frequency table before encoding"
## e_community. e_Community
##                    Chaola  Chazim'bobo     Chikho 2     Chinyata      Choumba     Kakoloha 
##        10417          363          313         1146          657          627          288 
##      Kanongo       Luwira       Mafuta     Mkombezi       Mlambe      Mzokoto     Nanzomba 
##          842          237          345          552          496          625          584 
##       Ndaula     Nyongani      Pondani Tamanimwendo    Waliranji 
##          729           73          596          552          811 
## [1] "Frequency table after encoding"
## e_community. e_Community
##   757   758   759   760   761   762   763   764   765   766   767   768   769   770   771   772 
##   842   811   496   584  1146   552   729   363   657   625   313   288    73   345 10417   596 
##   773   774   775 
##   552   237   627 
## [1] "Frequency table before encoding"
## b_ta. b_Traditional Authority
##                    KASAKULA       MAVWERE MWANKHUNIKILA 
##         11610          3273          4063          1307 
## [1] "Frequency table after encoding"
## b_ta. b_Traditional Authority
##   727   728   729   730 
##  1307 11610  4063  3273 
## [1] "Frequency table before encoding"
## b_comm. b_Community
##                      CHAOLA   CHAZIM'BOBO      CHIKHO 2      CHINYATA       CHOUMBA 
##         11610           117           273          1317           556           557 
##      KAKOLOHA       KANONGO        LUWIRA        MAFUTA      MKOMBEZI        MLAMBE 
##           183           718           215           323           433           406 
##       MZOKOTO      NANZOMBA        NDAULA      NYONGANI       PONDANI TAMANI MWENDO 
##           476           557           641            49           605           475 
##     WALIRANJI 
##           742 
## [1] "Frequency table after encoding"
## b_comm. b_Community
##   700   701   702   703   704   705   706   707   708   709   710   711   712   713   714   715 
##   557   605   476   718   406   556   183   557   742   641   273   215   117   475   433   323 
##   716   717   718 
##  1317 11610    49

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Top code household composition variables with large and unusual numbers 

# Topcode cases with 10 or more household members.
mydata <- top_recode(
  "num_people",
  break_point = 10,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## num_people. Can You Please Tell Me How Many People Live In This Household, Including Yoursel
##     2     3     4     5     6     7     8     9    10    11    12    13    14  <NA> 
##    77   391  1194  2073  2379  1874  1022   473   189   115    37     4     8 10417

## [1] "Frequency table after encoding"
## num_people. Can You Please Tell Me How Many People Live In This Household, Including Yoursel
##          2          3          4          5          6          7          8          9 
##         77        391       1194       2073       2379       1874       1022        473 
## 10 or more       <NA> 
##        353      10417

# Topcode cases with 10 or more household members (b_ variable).
mydata <- top_recode(
  "b_num_people",
  break_point = 10,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## b_num_people. b_Can You Please Tell Me How Many People Live In This Household, Including Yours
##     2     3     4     5     6     7     8     9    10    11    12    14  <NA> 
##     9    44   132   287   360   316   158    61    26    10     5     1 18844

## [1] "Frequency table after encoding"
## b_num_people. b_Can You Please Tell Me How Many People Live In This Household, Including Yours
##          2          3          4          5          6          7          8          9 
##          9         44        132        287        360        316        158         61 
## 10 or more       <NA> 
##         42      18844

# Topcode the child number at 6: values of 6 and above are grouped, and the
# recorded output labels that group "6 or more".
mydata <- top_recode(
  "b_child_number",
  break_point = 6,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## b_child_number. b_Child Number To Be Interviewed In This Household
##     1     2     3     4     5     6     7     8     9  <NA> 
##  3676  2604  1507   629   171    46     5     2     2 11611

## [1] "Frequency table after encoding"
## b_child_number. b_Child Number To Be Interviewed In This Household
##         1         2         3         4         5 6 or more      <NA> 
##      3676      2604      1507       629       171        55     11611

# Topcode cases with 10 or more household members (e_ variable).
mydata <- top_recode(
  "e_num_people",
  break_point = 10,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## e_num_people. e_Can You Please Tell Me How Many People Live In This Household, Including Yours
##     2     3     4     5     6     7     8     9    10    11    12    13    14  <NA> 
##    77   391  1194  2073  2379  1874  1022   473   189   115    37     4     8 10417

## [1] "Frequency table after encoding"
## e_num_people. e_Can You Please Tell Me How Many People Live In This Household, Including Yours
##          2          3          4          5          6          7          8          9 
##         77        391       1194       2073       2379       1874       1022        473 
## 10 or more       <NA> 
##        353      10417

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Names of the variables to tabulate for the indirect-PII review.
# Order is preserved exactly; related names are grouped several per line.
indirect_PII <- c(
  # Unprefixed variables
  "yardwork", "bird", "firewood", "hrsweek",
  # b_ counterparts
  "b_yardwork", "b_bird", "b_firewood", "b_bar",
  # e_ counterparts
  # NOTE(review): e_num_people is already top-coded earlier in this script;
  # confirm it still needs to be tabulated here.
  "e_num_people", "e_yardwork", "e_bird", "e_firewood",
  # b_youngchild_a .. b_youngchild_j
  "b_youngchild_a", "b_youngchild_b", "b_youngchild_c", "b_youngchild_d",
  "b_youngchild_e", "b_youngchild_f", "b_youngchild_g", "b_youngchild_h",
  "b_youngchild_i", "b_youngchild_j",
  "b_d2_a", "b_d2_b", "b_d2_c", "b_d2_d",
  "b_d3c",
  # b_w2* / b_kidelse_* / b_w3c_* (one row per letter suffix)
  "b_w2a", "b_kidelse_a", "b_w3c_a",
  "b_w2b", "b_kidelse_b", "b_w3c_b",
  "b_w2c", "b_kidelse_c", "b_w3c_c",
  "b_w2d", "b_kidelse_d", "b_w3c_d",
  "b_w2e", "b_kidelse_e", "b_w3c_e",
  "b_w2f", "b_kidelse_f", "b_w3c_f",
  "b_w2g", "b_kidelse_g", "b_w3c_g",
  "b_w2h", "b_kidelse_h", "b_w3c_h",
  "b_w2i", "b_kidelse_i", "b_w3c_i",
  "b_w2j", "b_kidelse_j", "b_w3c_j",
  "b_w2k",
  "b_w2l", "b_w3c_l",
  "b_w2m", "b_kidelse_m", "b_w3c_m",
  "b_w2o", "b_kidelse_o", "b_w3c_o",
  "b_w2p", "b_kidelse_p", "b_w3c_p",
  "b_w2q", "b_kidelse_q", "b_w3c_q",
  "b_w2r", "b_kidelse_r", "b_w3c_r",
  "b_kidtobseas", "b_w4a", "b_w4b", "b_w5",
  # b_hw* series
  "b_hw1a", "b_hw1b", "b_hw1c", "b_hw1d", "b_hw1e", "b_hw1f",
  "b_hw1g", "b_hw1i", "b_hw1k", "b_hw1m", "b_hw1n", "b_hw1o",
  "b_hw2", "b_hw3",
  "b_hw4a", "b_hw4b", "b_hw4c", "b_hw4d", "b_hw4e", "b_hw4f", "b_hw4g",
  "b_hw4h", "b_hw4i", "b_hw4j", "b_hw4k", "b_hw4l", "b_hw4m", "b_hw4n",
  "b_hw5a", "b_hw5b", "b_hw5c", "b_hw5d",
  # NOTE(review): spelled "yound" here but "young" in the b_ series above --
  # confirm against the column names in the data file.
  "e_youndchild_a", "e_youndchild_b", "e_youndchild_c", "e_youndchild_d",
  "e_youndchild_e", "e_youndchild_f", "e_youndchild_g", "e_youndchild_h",
  "e_youndchild_i", "e_youndchild_j",
  "e_d3c",
  # e_kidemp_a .. e_kidemp_r
  "e_kidemp_a", "e_kidemp_b", "e_kidemp_c", "e_kidemp_d", "e_kidemp_e",
  "e_kidemp_f", "e_kidemp_g", "e_kidemp_h", "e_kidemp_i", "e_kidemp_j",
  "e_kidemp_k", "e_kidemp_l", "e_kidemp_m", "e_kidemp_n", "e_kidemp_o",
  "e_kidemp_p", "e_kidemp_q", "e_kidemp_r",
  # e_kidelse_b .. e_kidelse_r
  "e_kidelse_b", "e_kidelse_c", "e_kidelse_d", "e_kidelse_e", "e_kidelse_f",
  "e_kidelse_g", "e_kidelse_h", "e_kidelse_i", "e_kidelse_j", "e_kidelse_k",
  "e_kidelse_l", "e_kidelse_m", "e_kidelse_n", "e_kidelse_o", "e_kidelse_p",
  "e_kidelse_q", "e_kidelse_r",
  # e_w3_4* series
  "e_w3_4a", "e_w3_4b", "e_w3_4c", "e_w3_4d", "e_w3_4e", "e_w3_4h", "e_w3_4i",
  "e_w3_4j", "e_w3_4l", "e_w3_4n", "e_w3_4o", "e_w3_4p", "e_w3_4q", "e_w3_4r",
  "e_tobseas", "e_w4a", "e_w4b", "e_w5",
  # e_hw* series
  "e_hw1_f", "e_hw1_g", "e_hw1_l", "e_hw1_m", "e_hw1_n", "e_hw1_o",
  "e_hw2", "e_hw3", "e_hw3_e",
  "e_hw4_a", "e_hw4_b", "e_hw4_c", "e_hw4_d", "e_hw4_e", "e_hw4_h",
  "e_hw4_i", "e_hw4_j", "e_hw4_k", "e_hw4_l", "e_hw4_m", "e_hw4_n",
  "e_hw5_a", "e_hw5_b", "e_hw5_c"
)

# Pass the indirect-PII variable names to the capture_tables helper.
capture_tables(indirect_PII)

# Recode those with very specific values.

Matching and crosstabulations: Run automated PII check

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age

# Categorical key variables for the sdcMicro object.
# !!! Replace with candidate categorical demo vars
selectedKeyVars <- c("sex_", "childage", "b_d4")

# Household identifier column.
# !!! Replace with household id
selectedHouseholdID <- "hhid"

# Create the sdcMicro object from the processed data with the key variables
# and household id assigned above.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
## Warning in cbind(reshier, unique(dataX[, 1])): number of rows of result is not a multiple of
## vector length (arg 1)
# Auto-print the sdcMicro object: the recorded output summarises the key
# variables and the 2-/3-/5-anonymity violation counts.
sdcInitial
## The input dataset consists of 20253 rows and 724 variables.
##   --> Categorical key variables: sex_, childage, b_d4
##   --> Cluster/Household-Id variable: hhid
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories      Mean size            Size of smallest (>0)       
##          sex_                    3  (3)  4918.000 (4918.000)                  4821 (4821)
##      childage                   14 (14)   893.077  (893.077)                   537  (537)
##          b_d4                   16 (16)   207.933  (207.933)                     1    (1)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 0 (0.000%)
##   - 3-anonymity: 0 (0.000%)
##   - 5-anonymity: 0 (0.000%)
## 
## ----------------------------------------------------------------------

Open-ends: review responses for any sensitive information, redact as necessary

# !!! No Open-Ends

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

# Export the processed data in Stata (.dta) and SPSS (.sav) formats; both
# file names are `filename` with the "_PU" suffix appended.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)

# Report title, built at run time from the dataset base name.
title_var <- paste0(
  "DOL-ILAB SDC - ",
  filename
)