# Start from a clean workspace: remove every object in the current
# environment, including hidden objects whose names begin with a dot.
# `all.names` is spelled out in full and given the logical constant TRUE;
# the previous `all=t` passed base R's transpose function `t` instead of a
# logical value, so hidden objects were not requested as intended.
rm(list = ls(all.names = TRUE))

Set up filenames

# !!! Update helper functions file: path of the R script loaded with source()
functions_vers <- "functions_1.7.R"
# !!! Update filename: base name reused for the "_PU" output files and the
# report title
filename <- "bhsection0"

Set up data and functions, and create the dictionary for dataset review

# Run the helper script whose path is stored in `functions_vers`.
# NOTE(review): `mydata`, encode_location() and capture_tables() are used later
# without being defined in this file, so they presumably come from this
# script -- confirm against the helper file.
source(file = functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (population <100,000)
# Large Location: Location (population >100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!! No Direct PII

Direct PII-team: Encode field team names

# !!! No Direct PII-team

Small locations: Encode locations with pop <100,000 using random large numbers

!!!Include relevant variables, but check their population size first to confirm they are <100,000

# Columns to remove from the dataset before locations are encoded
dropvars <- c(
  "dise",
  "q006_block_name"
)
# Keep every column whose name is not listed in `dropvars`
mydata <- mydata[!is.element(names(mydata), dropvars)]

# Location identifier columns (block and village codes) to be encoded
locvars <- c(
  "q006_block_id",
  "q007_vlg_id"
)
# encode_location() is not defined in this file; the output below shows it
# printing a frequency table before and after encoding for each variable.
# NOTE(review): `missing = 999999` is presumably the code the helper treats
# as a missing value -- confirm in the helper functions file.
mydata <- encode_location(missing = 999999, variables = locvars)
## [1] "Frequency table before encoding"
## q006_block_id. 6 Block Code
##    1    2    3    4    5    6    7    8    9 <NA> 
##  194  155  195  407   98  190  143  422  516   33 
## [1] "Frequency table after encoding"
## q006_block_id. 6 Block Code
##  279  280  281  282  283  284  285  286  287 <NA> 
##  422  155  195  516  407   98  194  143  190   33 
## [1] "Frequency table before encoding"
## q007_vlg_id. 7 Village Code
##    1    2    3    4    5    6    7    9   10   11   12   13   15   16   17   18   19   20 
##   16   16   16   15   20   31   28   17   15   20   24   24   15   18   21   17   17   18 
##   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38 
##   30   22   18   17   32   27   26   18   14   15   24   24   22   16   29   18   17   22 
##   39   40   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56 
##   27   17   16   18   17   28   20   24   21   19   17   17   16   18   26   24   27   18 
##   57   58   59   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74 
##   17   21   13   24   22   16   18   18   29   16   18   21   25   13   16   19   16   23 
##   75   76   77   78   80   81   82   83   84   85   87   88   89   90   91   92   93   94 
##   23   17   22   29   30   17   22   17   17   13   16   22   15   19   19   19   21   13 
##   95   96   97   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112 
##   17   22   28   21   25   18   24   21   15   19   14   31   16   27   21   17   21   26 
##  113  114  115  116  117  118  119 <NA> 
##   14   24   19   16   21   22   16   33 
## [1] "Frequency table after encoding"
## q007_vlg_id. 7 Village Code
##  265  266  267  268  269  270  271  272  273  274  275  276  277  278  279  280  281  282 
##   18   25   25   17   22   18   16   20   17   21   16   16   18   20   24   28   17   15 
##  283  284  285  286  287  288  289  290  291  292  293  294  295  296  297  298  299  300 
##   17   22   22   18   24   17   31   16   19   16   18   17   21   15   20   16   23   19 
##  301  302  303  304  305  306  307  308  309  310  311  312  313  314  315  316  317  318 
##   22   15   14   16   17   19   17   21   27   27   14   22   17   17   24   27   29   18 
##  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336 
##   18   16   28   24   22   17   29   13   15   17   30   21   15   16   26   22   24   18 
##  337  338  339  340  341  342  343  344  345  346  347  348  349  350  351  352  353  354 
##   18   19   18   17   16   26   21   13   30   22   28   17   19   21   15   14   13   13 
##  355  356  357  358  359  360  361  362  363  364  365  366  367  368  369  370  371  372 
##   18   21   16   32   24   21   29   16   24   17   18   19   26   27   17   16   31   16 
##  373  374  375  376  377  378  379 <NA> 
##   23   21   22   24   19   24   21   33

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# !!!No Indirect PII - Ordinal

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables flagged as indirect PII for review
indirect_PII <- c(
  "q012_urban",
  "s1_relation"
)
# capture_tables() is not defined in this file.
# NOTE(review): presumably tabulates each listed variable so very specific
# values can be spotted -- confirm in the helper functions file.
capture_tables(indirect_PII)

# Recode those with very specific values. 

Matching and crosstabulations: Run automated PII check

# Not enough variables to make matching possible

Open-ends: review responses for any sensitive information, redact as necessary

# !!! No open-ends

GPS data: Displace

# !!! No GPS

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

# Export the processed dataset twice: once as Stata (.dta) and once as
# SPSS (.sav), each named "<filename>_PU" in the working directory.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)

# Build the report title from a fixed prefix followed by the dataset name
title_var <- paste("DOL-ILAB SDC - ", filename, sep = "")