# Start from a clean workspace: remove every object in the current
# environment, including hidden (dot-prefixed) ones.
# The flag is spelled out as TRUE because a lowercase `t` refers to
# base::t() (the transpose function), not to a logical value.
rm(list = ls(all.names = TRUE))

Set up filenames

# Base name of the dataset being processed; the output file names and the
# report title are built from it.
# !!! Update filename
filename <- "bhsection3"

# Name of the helper-functions script that is sourced by this script.
# !!! Update helper functions file
functions_vers <- "functions_1.7.R"

Set up data and functions, and create a dictionary for dataset review

# Load the helper functions from the script named in `functions_vers`.
source(file = functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location with population <100,000
# Large Location: Location with population >100,000
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!! No Direct PII

Direct PII-team: Encode field team names

# !!! No Direct PII-team

Small locations: Encode locations with pop <100,000 using random large numbers

!!!Include relevant variables, but check their population size first to confirm they are <100,000

# Columns to remove from the dataset before the locations are encoded.
dropvars <- c("dise")
# Keep only the columns whose names are not listed in `dropvars`.
mydata <- mydata[!(colnames(mydata) %in% dropvars)]

# Location identifiers to encode: block code and village code.
locvars <- c("q006_block_id", "q007_vlg_id")
# encode_location() is not defined in this script (presumably it comes from
# the sourced helper file - confirm). It prints a frequency table for each
# variable before and after encoding.
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## q006_block_id. 6 Block Code
##    1    2    3    4    5    6    7    8    9 <NA> 
##  196  159  201  413   99  200  147  426  517   33 
## [1] "Frequency table after encoding"
## q006_block_id. 6 Block Code
##  279  280  281  282  283  284  285  286  287 <NA> 
##  426  159  201  517  413   99  196  147  200   33 
## [1] "Frequency table before encoding"
## q007_vlg_id. 7 Village Code
##    1    2    3    4    5    6    7    9   10   11   12   13   15   16   17   18   19   20 
##   20   16   16   16   21   31   28   17   15   20   24   24   15   18   24   17   17   18 
##   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38 
##   30   23   18   17   33   27   26   18   14   15   24   24   22   16   31   18   17   22 
##   39   40   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56 
##   28   17   16   18   17   29   20   24   21   19   18   17   16   18   26   28   27   18 
##   57   58   59   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74 
##   18   23   13   24   22   16   18   18   30   16   18   21   25   13   16   20   16   23 
##   75   76   77   78   80   81   82   83   84   85   87   88   89   90   91   92   93   94 
##   24   17   22   29   30   17   22   17   17   13   16   22   15   21   19   19   21   13 
##   95   96   97   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112 
##   18   22   28   21   25   18   24   22   15   19   19   31   16   27   21   20   21   26 
##  113  114  115  116  117  118  119 <NA> 
##   14   24   19   16   21   22   16   33 
## [1] "Frequency table after encoding"
## q007_vlg_id. 7 Village Code
##  265  266  267  268  269  270  271  272  273  274  275  276  277  278  279  280  281  282 
##   18   25   25   17   22   18   16   20   17   23   16   20   18   21   24   29   17   15 
##  283  284  285  286  287  288  289  290  291  292  293  294  295  296  297  298  299  300 
##   17   22   22   18   28   18   31   16   20   16   18   17   21   16   20   16   23   19 
##  301  302  303  304  305  306  307  308  309  310  311  312  313  314  315  316  317  318 
##   22   15   19   16   17   21   20   21   28   27   14   22   17   17   24   27   29   18 
##  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336 
##   18   16   28   24   22   17   30   13   15   18   30   21   15   16   26   23   24   18 
##  337  338  339  340  341  342  343  344  345  346  347  348  349  350  351  352  353  354 
##   18   19   18   17   16   26   21   13   30   22   28   17   19   22   15   14   13   13 
##  355  356  357  358  359  360  361  362  363  364  365  366  367  368  369  370  371  372 
##   18   24   16   33   24   21   31   16   24   17   18   19   26   27   18   16   31   16 
##  373  374  375  376  377  378  379 <NA> 
##   24   21   22   24   19   24   21   33

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Frequency table of member age, used to choose the top-coding break point.
table(mydata[["q302_age_mem"]])
## 
##  3  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 25 26 35 42 47 50 
##  1  5  8  5  8  3 10  7 18 19 21 34 27 20 20  2  4  2  6  3  1  1  1  1  1
# Top-code member age at 18: the printed "after" table shows ages of 18 and
# above collapsed into a single "18 or more" category.
mydata <- top_recode(
  variable = "q302_age_mem",
  break_point = 18,
  missing = NA
)
## [1] "Frequency table before encoding"
## q302_age_mem. 302 How old is ?
##    3    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21 
##    1    5    8    5    8    3   10    7   18   19   21   34   27   20   20    2    4    2 
##   22   25   26   35   42   47   50 <NA> 
##    6    3    1    1    1    1    1 2163

## [1] "Frequency table after encoding"
## q302_age_mem. 302 How old is ?
##          3          5          6          7          8          9         10         11 
##          1          5          8          5          8          3         10          7 
##         12         13         14         15         16         17 18 or more       <NA> 
##         18         19         21         34         27         20         42       2163

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables flagged as indirect PII that still need to be reviewed.
indirect_PII <- c(
  "q308_location",
  "q309_travel"
)
# capture_tables() is not defined in this script (presumably it comes from
# the sourced helper file - confirm).
capture_tables(indirect_PII)

Matching and crosstabulations: Run automated PII check

# Choose the variables for the sdcMicro object, based on the dictionary
# inspection.
# Reference: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Every name listed here has to match a variable name in the data file.

# Categorical key variables (e.g. gender, occupation/education and age).
# !!! Replace with candidate categorical demo vars
selectedKeyVars <- c("q302_age_mem", "q303_gender")

# Weight variable (add if available).
# !!! Replace with weight var
# selectedWeightVar <- c("projwt")

# Household id variable (cluster).
# !!! Replace with household id
selectedHouseholdID <- "hh_id"

# Build the sdcMicro object from the variables assigned above.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)

# Print the object's summary (key variables and k-anonymity violations).
sdcInitial
## The input dataset consists of 2391 rows and 27 variables.
##   --> Categorical key variables: q302_age_mem, q303_gender
##   --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories      Mean size           Size of smallest (>0)     
##  q302_age_mem                   16 (16)    15.200  (15.200)                     1  (1)
##   q303_gender                    3  (3)   114.000 (114.000)                    96 (96)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 3 (0.125%)
##   - 3-anonymity: 9 (0.376%)
##   - 5-anonymity: 30 (1.255%)
## 
## ----------------------------------------------------------------------

Show values of key variable of records that violate k-anonymity

# mydata <- labelDataset(mydata)
# Logical mask of records violating 2-anonymity: the second column of the
# individual risk table is below 2.
notAnon <- sdcInitial@risk$individual[, 2] < 2
# Show the key-variable values of the flagged records.
mydata[notAnon, selectedKeyVars]
## # A tibble: 3 x 2
##   q302_age_mem q303_gender
##      <dbl+lbl>   <dbl+lbl>
## 1            9  1 [Female]
## 2            3  0 [Male]  
## 3            5  1 [Female]
# Three cases do not meet 2-anonymity by gender and age, but these variables are critical for analysis, so no further suppression is applied.

Open-ends: review responses for any sensitive information, redact as necessary

# Open-ended ("other, specify") variables to review.
open_ends <- c(
  "q308_location_othr",
  "q309_travel_othr"
)
# report_open() is not defined in this script (presumably it comes from the
# sourced helper file - confirm).
report_open(list_open_ends = open_ends)
# Drop the open-ended columns: they hold verbatim data in the local language.
mydata <- mydata[!(colnames(mydata) %in% open_ends)]

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

# Write the processed data as a Stata file named "<filename>_PU.dta".
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
# Write the processed data as an SPSS file named "<filename>_PU.sav".
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)

# Build the report title from a fixed prefix plus the dataset file name.
title_var <- paste0(
  "DOL-ILAB SDC - ",
  filename
)