# Start from an empty workspace, hidden (dot-prefixed) objects included.
# `all.names = TRUE` is spelled out in full: the earlier `all=t` handed base
# R's transpose function `t` to the flag instead of the logical TRUE, and
# relied on partial argument matching.
rm(list = ls(all.names = TRUE))

Setup filenames

# Base name of the dataset; reused for the output file names and report title.
filename <- "Rwanda_Public Use" # !!!Update filename
# Script holding the helper functions that is sourced afterwards.
functions_vers <- "functions_1.7.R" # !!!Update helper functions file

Setup data, functions and create dictionary for dataset review

# Load the helper functions from the script named in `functions_vers`.
source(file = functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

!!!No direct PII

Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects

!!!Replace vector in "variables" field below with relevant variable names

# Encode Direct PII-team

!!!No direct PII-team

Small locations: Encode locations with pop <100,000 using random large numbers

!!!Include relevant variables, but check their population size first to confirm they are <100,000. Remove redundant small location information.

# Drop the redundant small-location columns: `nsector` plus the dummy
# indicators `sectordum1` ... `sectordum15`. The names are generated rather
# than typed out; the hand-written list repeated "sectordum15".
# NOTE(review): if that repeat was a typo for another column (for example a
# "sectordum16"), add it here -- confirm against the dictionary.
mydata <- mydata[!names(mydata) %in% c("nsector", paste0("sectordum", 1:15))]

Relabel small locations

# Label each retained sector dummy b_sectordum1 ... b_sectordum15 as
# "Bl_Sector==<k>", where <k> is the number in the column name.
for (k in 1:15) {
  var_label(mydata[[paste0("b_sectordum", k)]]) <- paste0("Bl_Sector==", k)
}

# Replace the site names in the small-location variables with numeric codes;
# 999999 is supplied as the missing-value code.
locvars <- c("e_v1d", "e_v4d")
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## e_v1d. T: Mfs Site
##             BURUHUKIRO                 GATARE                NKOMANE              UWINKINGI 
##                     17                     21                     24                     30 
##                 RUHERU              NYABIMATA                MUGANZA                 NKUNGU 
##                     35                     25                     14                     24 
##                 GIHEKE                KAMEMBE               RUGABANO                MUTUNTU 
##                     44                     12                     57                     19 
##                GASHALI                 KAVUMU                MUHANDA Other (please specify) 
##                     20                     27                     21                     40 
##                   <NA> 
##                    532 
## [1] "Frequency table after encoding"
## e_v1d. T: Mfs Site
##  554  555  556  557  558  559  560  561  562  563  564  565  566  567  568  569 <NA> 
##   14   12   40   19   24   21   57   27   24   30   44   21   17   35   20   25  532 
## [1] "Frequency table before encoding"
## e_v4d. C: Mfs Site
##             BURUHUKIRO                 GATARE                 NKUNGU               RUGABANO 
##                      4                      1                      1                      4 
##                MUTUNTU                 KAVUMU                MUHANDA Other (please specify) 
##                      1                      3                      1                      1 
##                   <NA> 
##                    946 
## [1] "Frequency table after encoding"
## e_v4d. C: Mfs Site
##  904  905  906  907  908  909  910  911 <NA> 
##    4    3    1    1    1    1    1    4  946

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" of 10 or less. 

# Collapse the detailed education codes into five bands. The break points
# give the intervals [1,3), [3,9), [9,15), [15,88) and 88 upwards, which
# receive the value labels defined here.
break_edu <- c(1, 3, 9, 15, 88)
labels_edu <- c(
  "1. No school, Kindergarten/Nursery" = 1,
  "2. Primary (P1-P6)" = 2,
  "3. Secondary (S1-S6)" = 3,
  "4. Vocational training or College/University" = 4,
  "5. Don't know" = 5
)
# Woman's education (b_hh8).
mydata <- ordinal_recode(
  variable = "b_hh8",
  break_points = break_edu,
  missing = 8888,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## b_hh8. Hh8. Woman Education
##           No school                  P1                  P2                  P3                  P4 
##                  18                   6                   7                  18                  42 
##                  P5                  P6                  S1                  S2                  S3 
##                  86                 244                  89                  69                  49 
##                  S4                  S5                  S6 Vocational training  College/University 
##                  14                  19                  50                   3                   4 
##          Don't know                <NA> 
##                  45                 199 
##     recoded
##      [1,3) [3,9) [9,15) [15,88) [88,8.89e+03)
##   1     18     0      0       0             0
##   3      0     6      0       0             0
##   4      0     7      0       0             0
##   5      0    18      0       0             0
##   6      0    42      0       0             0
##   7      0    86      0       0             0
##   8      0   244      0       0             0
##   9      0     0     89       0             0
##   10     0     0     69       0             0
##   11     0     0     49       0             0
##   12     0     0     14       0             0
##   13     0     0     19       0             0
##   14     0     0     50       0             0
##   15     0     0      0       3             0
##   16     0     0      0       4             0
##   88     0     0      0       0            45
## [1] "Frequency table after encoding"
## b_hh8. Hh8. Woman Education
##           1. No school, Kindergarten/Nursery                           2. Primary (P1-P6) 
##                                           18                                          403 
##                         3. Secondary (S1-S6) 4. Vocational training or College/University 
##                                          290                                            7 
##                                5. Don't know                                         <NA> 
##                                           45                                          199 
## [1] "Inspect value labels and relabel as necessary"
##           1. No school, Kindergarten/Nursery                           2. Primary (P1-P6) 
##                                            1                                            2 
##                         3. Secondary (S1-S6) 4. Vocational training or College/University 
##                                            3                                            4 
##                                5. Don't know 
##                                            5
# Man's education (b_hh9), banded with the same break points and labels as
# b_hh8.
mydata <- ordinal_recode(
  variable = "b_hh9",
  break_points = break_edu,
  missing = 8888,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## b_hh9. Hh9. Man Education
##            No school Kindergarten/Nursery                   P1                   P2 
##                   28                    1                    5                   16 
##                   P3                   P4                   P5                   P6 
##                   27                   54                  109                  191 
##                   S1                   S2                   S3                   S4 
##                   56                   72                   48                   16 
##                   S5                   S6  Vocational training   College/University 
##                   17                   58                    3                    6 
##           Don't know                 <NA> 
##                   56                  199 
##     recoded
##      [1,3) [3,9) [9,15) [15,88) [88,8.89e+03)
##   1     28     0      0       0             0
##   2      1     0      0       0             0
##   3      0     5      0       0             0
##   4      0    16      0       0             0
##   5      0    27      0       0             0
##   6      0    54      0       0             0
##   7      0   109      0       0             0
##   8      0   191      0       0             0
##   9      0     0     56       0             0
##   10     0     0     72       0             0
##   11     0     0     48       0             0
##   12     0     0     16       0             0
##   13     0     0     17       0             0
##   14     0     0     58       0             0
##   15     0     0      0       3             0
##   16     0     0      0       6             0
##   88     0     0      0       0            56
## [1] "Frequency table after encoding"
## b_hh9. Hh9. Man Education
##           1. No school, Kindergarten/Nursery                           2. Primary (P1-P6) 
##                                           29                                          402 
##                         3. Secondary (S1-S6) 4. Vocational training or College/University 
##                                          267                                            9 
##                                5. Don't know                                         <NA> 
##                                           56                                          199 
## [1] "Inspect value labels and relabel as necessary"
##           1. No school, Kindergarten/Nursery                           2. Primary (P1-P6) 
##                                            1                                            2 
##                         3. Secondary (S1-S6) 4. Vocational training or College/University 
##                                            3                                            4 
##                                5. Don't know 
##                                            5
# Top code household composition variables with large and unusual numbers

# Household size (b_hh_masked): collapse 10 or more members into one category.
mydata <- top_recode("b_hh_masked", break_point = 10, missing = 999999)
## [1] "Frequency table before encoding"
## b_hh_masked. Hh. Household Size
##    2    3    4    5    6    7    8    9   10   11   12  13+ <NA> 
##   16   55   57  113  115  148  104   76   48   21    5    5  199

## [1] "Frequency table after encoding"
## b_hh_masked. Hh. Household Size
##          2          3          4          5          6          7          8          9 10 or more 
##         16         55         57        113        115        148        104         76         79 
##       <NA> 
##        199

# Household size (hh): collapse 10 or more members into one category.
mydata <- top_recode("hh", break_point = 10, missing = 999999)
## [1] "Frequency table before encoding"
## hh. Hh. Household Size
##   2   3   4   5   6   7   8   9  10  11  12  13  14 
##  23  71  74 139 148 182 136  91  59  27   7   4   1

## [1] "Frequency table after encoding"
## hh. Hh. Household Size
##          2          3          4          5          6          7          8          9 10 or more 
##         23         71         74        139        148        182        136         91         98

# Top code number of household adults and household adults working.

# Adults over 18 (b_hh6_masked): collapse 6 or more into one category.
mydata <- top_recode("b_hh6_masked", break_point = 6, missing = 999999)
## [1] "Frequency table before encoding"
## b_hh6_masked. Hh6. Adults>18
## 1 or less         2         3         4         5         6         7        8+      <NA> 
##        97       266       187       100        68        26         9        10       199

## [1] "Frequency table after encoding"
## b_hh6_masked. Hh6. Adults>18
## 1 or less         2         3         4         5 6 or more      <NA> 
##        97       266       187       100        68        45       199

# Working adults (b_hh7_masked): collapse 4 or more into one category.
mydata <- top_recode("b_hh7_masked", break_point = 4, missing = 999999)
## [1] "Frequency table before encoding"
## b_hh7_masked. Hh7. Adults>18 Working
##    0    1    2    3    4    5    6   7+ <NA> 
##  278  184  183   66   25   15    6    6  199

## [1] "Frequency table after encoding"
## b_hh7_masked. Hh7. Adults>18 Working
##         0         1         2         3 4 or more      <NA> 
##       278       184       183        66        52       199

# Drop the birth-month columns: date of birth is a strong identifier and age
# is already provided in separate variables.

mydata <- mydata[!(names(mydata) %in% c("b_d2_month", "e_d1a_month"))]

# !!!Include relevant variables in list below

# Names of the variables treated as indirect PII in this dataset. The vector
# is handed to capture_tables() right after this definition; element order is
# kept exactly as listed.
# NOTE(review): the b_ / e_ prefixes presumably mark two survey rounds
# (baseline / endline) -- confirm against the dictionary.
indirect_PII <- c("b_age_imputed",
                  "b_conditions",
                  "b_filliterate",
                  "b_fjunsecondary",
                  "b_fprimary",
                  "b_fsensecondary",
                  "b_ftertiary",
                  "b_harassment",
                  "b_hclprevalence",
                  "b_hclprevalence_project",
                  "b_hcltothrs",
                  "b_healthissues",
                  "b_hh_below15_masked",
                  "b_hrsconstruction",
                  "b_hrsdomestic",
                  "b_hrsfarming",
                  "b_hrsfire",
                  "b_hrshandcraft",
                  "b_hrslivestock",
                  "b_hrsmore100",
                  "b_hrsother",
                  "b_hrswashing",
                  "b_hrswater",
                  "b_hrsworked_tot",
                  "b_hrsworked_tot_nodom_masked",
                  "b_hw1_e_dum",
                  "b_hw1_f_dum",
                  "b_hw1_g_dum",
                  "b_hw1_h_dum",
                  "b_hw1_i_dum",
                  "b_hw1_j_dum",
                  "b_hw1_k_dum",
                  "b_hw1_l_dum",
                  "b_hw1_m_dum",
                  "b_hw1_n_dum",
                  "b_hw1_o_dum",
                  "b_hw1_p_dum",
                  "b_hw1_q_dum",
                  "b_hw1_r_dum",
                  "b_hw1_s_dum",
                  "b_hw1_t_dum",
                  "b_hw1_u_dum",
                  "b_hw1_v_dum",
                  "b_hw1_w_dum",
                  "b_hw1_x_dum",
                  "b_hw1_y_dum",
                  "b_hw1_z_dum",
                  "b_hw1_st_dum",
                  "b_hw1_yz_dum",
                  "b_i1_a_dum",
                  "b_i1_b_dum",
                  "b_i1_c_dum",
                  "b_i1_d_dum",
                  "b_i1_e_dum",
                  "b_i1_f_dum",
                  "b_i1_g_dum",
                  "b_i1_h_dum",
                  "b_i1_i_dum",
                  "b_i1_j_dum",
                  "b_i1_k_dum",
                  "b_i1_l_dum",
                  "b_i1_nonpoultry",
                  "b_institutions",
                  "b_interview_month",
                  "b_interview_year",
                  "b_lastjunsecondary",
                  "b_lastnosch",
                  "b_lastprimary",
                  "b_lastsensecondary",
                  "b_lastvoctraining",
                  "b_locations",
                  "b_machineryuse",
                  "b_mfs_agrwithoutprotection",
                  "b_milliterate",
                  "b_minor",
                  "b_mjunsecondary",
                  "b_mprimary",
                  "b_msensecondary",
                  "b_mtertiary",
                  "b_repetition",
                  "b_single",
                  "b_useproduct",
                  "e_activities",
                  "e_HRS_WKD",
                  "e_HRS_WKD2",
                  "e_HZAG",
                  "e_NO_REST_DAY",
                  "e_conditions",
                  "e_conditions2",
                  "e_harassment",
                  "e_healthissues",
                  "e_hlprevalence",
                  "e_institutions",
                  "e_locations",
                  "e_machineryuse",
                  "e_mfs_agwoprotect",
                  "e_minor",
                  "e_protectivegear",
                  "e_tot_hrs_pastweek_masked",
                  "e_useproduct",
                  "female",
                  "age_imputed",
                  "single",
                  "hw1m_d",
                  "hh1",
                  "hh5",
                  "i1d",
                  "i2",
                  "b_q17_ownphone",
                  "b_d1_female",
                  "b_d2_year",
                  "b_d3",
                  "b_d4",
                  "b_d4a",
                  "b_d4b",
                  "b_d4b_days",
                  "b_d5a",
                  "b_d5b",
                  "b_d5c",
                  "b_w1a",
                  "b_w1a_work",
                  "b_w1a1",
                  "b_w1a2",
                  "b_w1a3",
                  "b_w1b",
                  "b_w1b1",
                  "b_w1b2",
                  "b_w1b3",
                  "b_w1c",
                  "b_w1c1",
                  "b_w1c2",
                  "b_w1c3",
                  "b_w1d",
                  "b_w1d1",
                  "b_w1d2",
                  "b_w1d3",
                  "b_w1e",
                  "b_w1e1",
                  "b_w1e2",
                  "b_w1e3",
                  "b_w1f",
                  "b_w1f1",
                  "b_w1f2",
                  "b_w1f3",
                  "b_w2a",
                  "b_w2b",
                  "b_w2c",
                  "b_w2d",
                  "b_w2e",
                  "b_w2f",
                  "b_w2g",
                  "b_w2h",
                  "b_w2i",
                  "b_w2_other",
                  "b_w3_1a",
                  "b_w3_1b",
                  "b_w3_1c",
                  "b_w3_1d",
                  "b_w3_1e_masked",
                  "b_w3_2a",
                  "b_w3_2b",
                  "b_w3_2c",
                  "b_w3_2d",
                  "b_w3_2e_masked",
                  "b_w3_3a",
                  "b_w3_3b",
                  "b_w3_3c",
                  "b_w3_3d",
                  "b_w3_3e_masked",
                  "b_w3_4a",
                  "b_w3_4b",
                  "b_w3_4c",
                  "b_w3_4d",
                  "b_w3_4e_masked",
                  "b_w3_5a",
                  "b_w3_5b",
                  "b_w3_5c",
                  "b_w3_5d",
                  "b_w3_5e_masked",
                  "b_w3_6a",
                  "b_w3_6b",
                  "b_w3_6c",
                  "b_w3_6d",
                  "b_w3_6e_masked",
                  "b_w3_7a",
                  "b_w3_7b",
                  "b_w3_7c",
                  "b_w3_7d",
                  "b_w3_7e_masked",
                  "b_w3_8a",
                  "b_w3_8b",
                  "b_w3_8c",
                  "b_w3_8d",
                  "b_w3_8e_masked",
                  "b_w3_9a",
                  "b_w3_9b",
                  "b_w3_9c",
                  "b_w3_9d",
                  "b_w3_9e_masked",
                  "b_w3_10a",
                  "b_w3_10b",
                  "b_w3_10c",
                  "b_w3_10d",
                  "b_w3_10e_masked",
                  "b_w3_11",
                  "b_w3_12",
                  "b_w3_13",
                  "b_w3_14",
                  "b_w3_15",
                  "b_w3_16",
                  "b_w3_17",
                  "b_w3_21",
                  "b_w3_22",
                  "b_w3_23",
                  "b_w3_24",
                  "b_w3_25",
                  "b_w3_26",
                  "b_w3_27",
                  "b_w3_31",
                  "b_w3_32",
                  "b_w3_33",
                  "b_w3_34",
                  "b_w3_35",
                  "b_w3_36",
                  "b_w3_37",
                  "b_w3_41",
                  "b_w3_42",
                  "b_w3_43",
                  "b_w3_44",
                  "b_w3_45",
                  "b_w3_46",
                  "b_w3_47",
                  "b_w3_51",
                  "b_w3_52",
                  "b_w3_53",
                  "b_w3_54",
                  "b_w3_55",
                  "b_w3_56",
                  "b_w3_57",
                  "b_w3_61",
                  "b_w3_62",
                  "b_w3_63",
                  "b_w3_64",
                  "b_w3_65",
                  "b_w3_66",
                  "b_w3_67",
                  "b_w3_71",
                  "b_w3_72",
                  "b_w3_73",
                  "b_w3_74",
                  "b_w3_75",
                  "b_w3_76",
                  "b_w3_77",
                  "b_w3_81",
                  "b_w3_82",
                  "b_w3_83",
                  "b_w3_84",
                  "b_w3_85",
                  "b_w3_86",
                  "b_w3_87",
                  "b_w3_91",
                  "b_w3_92",
                  "b_w3_93",
                  "b_w3_94",
                  "b_w3_95",
                  "b_w3_96",
                  "b_w3_97",
                  "b_w3_101",
                  "b_w3_102",
                  "b_w3_103",
                  "b_w3_104",
                  "b_w3_105",
                  "b_w3_106",
                  "b_w3_107",
                  "b_w4b_1",
                  "b_w4b_2",
                  "b_w4b_3",
                  "b_w4b_4",
                  "b_w4b_5",
                  "b_w4b_6",
                  "b_w4b_7",
                  "b_w4b_8",
                  "b_w4b_9",
                  "b_w4b_10",
                  "b_w4b_other",
                  "b_w5_1",
                  "b_w5_2",
                  "b_w5_3",
                  "b_w5_4",
                  "b_w5_5",
                  "b_w5_6",
                  "b_w6_1",
                  "b_w6_2",
                  "b_w6_3",
                  "b_w6_4",
                  "b_w6_5",
                  "b_w6_6",
                  "b_w6_7",
                  "b_w7_1",
                  "b_w7_2",
                  "b_hw1_a",
                  "b_hw1_b",
                  "b_hw1_c",
                  "b_hw1_d",
                  "b_hw1_e",
                  "b_hw1_f",
                  "b_hw1_farming",
                  "b_hw1_g",
                  "b_hw1_h",
                  "b_hw1_i",
                  "b_hw1_j",
                  "b_hw1_k",
                  "b_hw1_l",
                  "b_hw1_m",
                  "b_hw1_n",
                  "b_hw1_o",
                  "b_hw1_oth",
                  "b_hw1_p",
                  "b_hw1_q",
                  "b_hw1_r",
                  "b_hw1_s",
                  "b_hw1_t",
                  "b_hw1_u",
                  "b_hw1_v",
                  "b_hw1_w",
                  "b_hw1_x",
                  "b_hw1_y",
                  "b_hw1_z",
                  "b_hw2_a",
                  "b_hw2_b",
                  "b_hw2_c",
                  "b_hw2_d",
                  "b_hw2_e",
                  "b_hw2_f",
                  "b_hw2_g",
                  "b_hw2_h",
                  "b_hw2_i",
                  "b_hw2_j",
                  "b_hw2_k",
                  "b_hw2_l",
                  "b_hw2_m",
                  "b_hw2_n",
                  "b_hw2_o",
                  "b_hw3_a",
                  "b_hw3_b",
                  "b_hw3_c",
                  "b_hw3_d",
                  "b_hw3_e",
                  "b_hw3_f",
                  "b_hw3_g",
                  "b_hw3_h",
                  "b_hw3_i",
                  "b_hw3_j",
                  "b_hw3_k",
                  "b_hw3_l",
                  "b_hw3_m",
                  "b_hw4_a",
                  "b_hw4_b",
                  "b_hw4_c",
                  "b_hw4_d",
                  "b_hw4_e",
                  "b_hw4_f",
                  "b_hw4_g",
                  "b_hw4_h",
                  "b_hw4_i",
                  "b_hw4_j",
                  "b_hw4_k",
                  "b_hw4_l",
                  "b_hw4_m",
                  "b_hw4_n_other1",
                  "b_hw4_n_other2",
                  "b_hw5_a",
                  "b_hw5_b",
                  "b_hw5_c",
                  "b_hw5_d",
                  "b_hh1_masked",
                  "b_hh2_masked",
                  "b_hh3_masked",
                  "b_hh4_masked",
                  "b_hh5",
                  "b_i1_b_masked",
                  "b_i1_c_masked",
                  "b_i1_d_masked",
                  "b_i1_e_masked",
                  "b_i1_f_masked",
                  "b_i1_g_masked",
                  "b_i1_h_masked",
                  "b_i1_i_masked",
                  "b_i1_j_masked",
                  "b_i1_k_masked",
                  "b_i1_l_masked",
                  "b_i2",
                  "b_i2plots_masked",
                  "b_i3plots_masked",
                  "b_ci21",
                  "e_interview_month",
                  "e_interview_year",
                  "e_q14",
                  "e_d1a_year_masked",
                  "e_d1b_masked",
                  "e_d2",
                  "e_d3a",
                  "e_d3b",
                  "e_d3bsp_masked",
                  "e_w1a_tea",
                  "e_w1a_coff",
                  "e_w1a_rice",
                  "e_w1b_tea",
                  "e_w1b_coff",
                  "e_w1b_rice",
                  "e_w1c_a",
                  "e_w1c_b",
                  "e_w1c_c",
                  "e_w1c_d",
                  "e_w1c_e",
                  "e_w1c_f",
                  "e_w1c_g",
                  "e_w1c_h",
                  "e_w1c_i",
                  "e_w1c_j",
                  "e_w1c_k",
                  "e_w1c_l",
                  "e_w1d1a",
                  "e_w1d1b",
                  "e_w1d1c",
                  "e_w1d1d",
                  "e_w1d1e",
                  "e_w1d1f",
                  "e_w1d1g",
                  "e_w1d1h",
                  "e_w1d1i",
                  "e_w1d1j",
                  "e_w1d1k",
                  "e_w1d1l",
                  "e_w1d1m",
                  "e_w1d1n",
                  "e_w1d1o",
                  "e_w1d1p",
                  "e_w1d1q",
                  "e_w1d1r",
                  "e_w1d1s",
                  "e_w1d1t",
                  "e_w1d1u",
                  "e_w1d1v",
                  "e_w1d1z",
                  "e_w1d2a",
                  "e_w1d2b",
                  "e_w1d2c",
                  "e_w1d2d",
                  "e_w1d2e",
                  "e_w1d2f",
                  "e_w1d2g",
                  "e_w1d2h",
                  "e_w1d2i",
                  "e_w1d2j",
                  "e_w1d2k",
                  "e_w1d2l",
                  "e_w1d2m",
                  "e_w1d2n",
                  "e_w1d2o",
                  "e_w1d2p",
                  "e_w1d2q",
                  "e_w1d2r",
                  "e_w1d2s",
                  "e_w1d2t",
                  "e_w1d2u",
                  "e_w1d2v",
                  "e_w1d2z",
                  "e_w2a1",
                  "e_w2a2",
                  "e_w2a3",
                  "e_w2a4",
                  "e_w2a5",
                  "e_w2a6",
                  "e_w2a7",
                  "e_w2b_masked",
                  "e_w2c_masked",
                  "e_w2d",
                  "e_w3a",
                  "e_w3b",
                  "e_w3c",
                  "e_w3d",
                  "e_w3e",
                  "e_w3f",
                  "e_hw1a",
                  "e_hw1b",
                  "e_hw1c",
                  "e_hw1d",
                  "e_hw1e",
                  "e_hw1f",
                  "e_hw2a",
                  "e_hw2b",
                  "e_hw2c",
                  "e_hw2d",
                  "e_hw2e",
                  "e_hw2f",
                  "e_hw2g",
                  "e_hw2h",
                  "e_hw2i",
                  "e_hw2j",
                  "e_hw2k",
                  "e_hw2l",
                  "e_hw2m",
                  "e_hw2n",
                  "e_hw2o",
                  "e_hw3a",
                  "e_hw3b",
                  "e_hw3c",
                  "e_hw3d",
                  "e_hw3e",
                  "e_hw3f",
                  "e_hw3g",
                  "e_hw3h",
                  "e_hw3i",
                  "e_hw3j",
                  "e_hw3k",
                  "e_hw3l",
                  "e_hw3m",
                  "e_hw4a",
                  "e_hw4b",
                  "e_hw4c",
                  "e_hw4d",
                  "e_hw4e",
                  "e_hw4f",
                  "e_hw4g",
                  "e_hw4h",
                  "e_hw4i",
                  "e_hw4j",
                  "e_hw4k",
                  "e_hw4l",
                  "e_hw4m",
                  "e_hw5a",
                  "e_hw5b",
                  "e_hw5c",
                  "e_hw5d")

# Hand the indirect-PII variable names to the capture_tables() helper.
capture_tables(indirect_PII)

Matching and crosstabulations: Run automated PII check

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age

# Build one education key variable: start from b_d5a and, where that is
# missing, fall back to b_d4a.
mydata[["educ"]] <- mydata[["b_d5a"]]
mydata[["educ"]][is.na(mydata[["educ"]])] <-
  mydata[["b_d4a"]][is.na(mydata[["educ"]])]
# Categorical key variables used for the disclosure-risk check.
selectedKeyVars <- c("female", "age_imputed", "educ") ##!!! Replace with candidate categorical demo vars

# weight variable
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# creating the sdcMicro object with the assigned variables
# Build the sdcMicro object on the chosen key variables, then display it.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 962 rows and 700 variables.
##   --> Categorical key variables: female, age_imputed, educ
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories      Mean size           Size of smallest (>0)      
##        female                    2  (2)   481.000 (481.000)                   441 (441)
##   age_imputed                    4  (4)   320.000 (320.000)                    68  (68)
##          educ                   12 (12)    69.364  (69.364)                     1   (1)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 0 (0.000%)
##   - 3-anonymity: 0 (0.000%)
##   - 5-anonymity: 0 (0.000%)
## 
## ----------------------------------------------------------------------

!!!No records violate 2-anonymity. Open-ends: review responses for any sensitive information, redact as necessary.

# !!! Identify open-end variables here: 
# Open-ended (free-text) variables handed to the report_open() helper.
open_ends <- c(
  "b_d3_other",
  "e_v3a_other_transl",
  "e_v4a_other_transl"
)
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number

# b_d3_other is removed outright: it holds verbatim data in Kinyarwanda.
mydata <- mydata[!(names(mydata) %in% "b_d3_other")]
# Rows of e_v4a_other_transl that still hold verbatim Kinyarwanda text are
# masked rather than dropped. Row numbers come from the manual review, so they
# rely on the row order being unchanged since that review.
mydata$e_v4a_other_transl[c(3, 15, 18)] <- "[Kinyarwanda]"
# Redacted: a small location appears in this response.
mydata$e_v4a_other_transl[4] <- "[Location]"
# Redacted: a school name appears in this response.
mydata$e_v4a_other_transl[16] <- "[School]"

GPS data: Displace

# !!! No GPS

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

# Write the processed data as Stata (.dta) and SPSS (.sav) files, each named
# "<filename>_PU".
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Build the report title by appending the dataset name held in `filename`
# to the fixed prefix "DOL-ILAB SDC - ".
title_var <- paste0("DOL-ILAB SDC - ", filename)