# Clear the workspace, including hidden objects (names beginning with ".").
# BUG FIX: the original passed `all = t`, which supplies the transpose
# function t() (not the logical TRUE) to ls()'s `all.names` argument.
# Spell out the argument and use TRUE, never the reassignable shortcuts T/t.
rm(list = ls(all.names = TRUE))

Setup and create dictionary

# Base name for every file written by this script.
filename <- "Round3_demo" # !!!Update filename for each round

# Load the project's anonymization helpers (encode_*, *_recode,
# top_recode, haven_table, capture_tables, ...).
source("functions_1.5.R")

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

#!!!Save flagged dictionary in .xlsx format, add "DatasetReview" to name and continue processing data with subset of flagged variables

Direct PII: variables to be removed

# !!!No direct PII variables

Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects

!!!Replace vector in "variables" field below with relevant variable names

# Encode Direct PII-team

# Replace interviewer names with anonymous numeric codes so interviewer
# effects can still be analysed without exposing field-team identities.
team_name_vars <- c("Srvyr")
mydata <- encode_direct_PII_team(variables = team_name_vars)
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
##       alka.adhikari    ambir.raj.kulung         amrita.roka anjana.kumari.dulal     ashish.shrestha 
##                  79                  96                  89                  98                  82 
## bhanu.bhakta.dhakal       dev.raj.nepal dhan.kumari.darlami       gita.maharjan       kamala.sharma 
##                  77                   2                  85                  99                  78 
##        manjula.giri min.kumari.shrestha       nabina.khadka      niraj.shrestha    pramila.shrestha 
##                  99                  86                  80                  85                  77 
##    pratika.shrestha rabischandra.bhatta   ram.kumar.acharya     sajina.shrestha     sandip.shrestha 
##                  85                  88                  88                  73                  96 
##       sapana.gautam     sarita.shrestha     tirtha.maya.rai        yamuna.karki 
##                  80                  99                 104                  86 
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24 
##  79  96  89  98  82  77   2  85  99  78  99  86  80  85  77  85  88  88  73  96  80  99 104  86

Small locations: Encode locations with pop <100,000 using random large numbers

!!!Include relevant variables, but check their population size first to confirm they are <100,000

# Locations with population below 100,000 are re-identification risks:
# replace their codes with random large numbers (999999 = missing code).
small_location_vars <- c("ID_4", "VDC")
mydata <- encode_location(variables = small_location_vars, missing = 999999)
## [1] "Frequency table before encoding"
## ID_4. Ward Number
##   1   2   3   4   5   6   7   8   9  10  11  12  14 
## 263 246 169 237 162 290 209 155 218  11  19  22  10 
## [1] "Frequency table after encoding"
## ID_4. Ward Number
## 980 981 982 983 984 985 986 987 988 989 990 991 992 
## 263 246 169 237 162 290 209 155 218  11  19  22  10 
## [1] "Frequency table before encoding"
## VDC. VDC code
##          Barahathawa         Dhungrekhola             Lalbandi         Malangawa NP            Netraganj 
##                   38                   29                   61                   27                   34 
##             Raniganj            Sankarpur         Bhimeswor NP                Bocha          Dandakharka 
##                   37                   28                   32                   17                   32 
##                Fasku             Katakuti            Lamidanda               Melung               Pawati 
##                   31                   31                   29                   34                   27 
##              Badegau                Irkhu         BhoteNamlang           Talamarang                Ichok 
##                   24                   33                   32                   28                   32 
##             Kadambas            Langarche             Melamchi              Anaikot    Baluwapati Deupur 
##                   25                   32                   34                   25                   21 
##    Chalal Ganeshthan    Kalati Bhumidanda       Mahankal Chaur            Methinkot           Patalekhet 
##                   41                   41                   26                   35                   25 
##              Raviopi               Balkot        Changunarayan             Chitapol              Duwakot 
##                   42                   22                   35                   34                   27 
##                Gundu   Madhyapur Thimi NP              Nankhel              Sirutar              Baireni 
##                   33                   29                   33                   36                   36 
##               Dhussa                Khari            Kiranchok              Naubise            Salyantar 
##                   31                   32                   34                   27                   35 
##        Sunaula Bazar               Thakre             Chitlang           Churiyamai               Fakhel 
##                   33                   24                   27                   28                   30 
##         Padampokhari            Kulekhani            Nibuwatar   Shreepur Chhatiwan Sisneri Mahadevsthan 
##                   39                   37                   32                   37                   29 
##        Birendranagar              Jutpani               Kathar           Khairahani             Padampur 
##                   18                   39                   37                   32                   38 
##           Parbatipur                Piple           Shaktikhor 
##                   31                   37                   36 
## [1] "Frequency table after encoding"
## VDC. VDC code
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 
##  38  29  29  32  27  34  37  28  32  17  32  31  31  29  34  27  24  33  32  28  32  25  32  34  25  21  41  41 
## 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 
##  26  35  25  42  22  35  34  27  33  29  33  36  36  31  32  34  27  35  33  24  27  28  30  39  37  32  37  29 
## 337 338 339 340 341 342 343 344 
##  18  39  37  32  38  31  37  36

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" of 10 or less. 

# Inspect the raw age distribution (D_2) before choosing recode break points;
# the tail values (66-69) have frequencies of 10 or less and drive the recode.
haven_table("D_2")
## D_2. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 
## 18 44 60 62 63 72 68 72 78 60 49 41 35 53 59 50 56 54 53 52 36 35 37 51 34 42 28 41 44 31 36 34 37 34 32 28 31 29 
## 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
## 28 23 27 19 25 32 15 18 17 18 13 13  9 12  2  1
# Recode exact age (D_2) into ten-year bands. 999999 is the missing code and
# lands in the final [100, 1e+06) interval, labelled "NA".
age_breaks <- c(15, 25, 35, 45, 55, 65, 100)
age_labels <- c(
  "15-24"        = 1,
  "25-34"        = 2,
  "35-44"        = 3,
  "45-54"        = 4,
  "55-64"        = 5,
  "65 and older" = 6,
  "NA"           = 7
)
mydata <- ordinal_recode(variable = "D_2", break_points = age_breaks,
                         missing = 999999, value_labels = age_labels)

##     recoded
##      [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
##   16      18       0       0       0       0        0           0
##   17      44       0       0       0       0        0           0
##   18      60       0       0       0       0        0           0
##   19      62       0       0       0       0        0           0
##   20      63       0       0       0       0        0           0
##   21      72       0       0       0       0        0           0
##   22      68       0       0       0       0        0           0
##   23      72       0       0       0       0        0           0
##   24      78       0       0       0       0        0           0
##   25       0      60       0       0       0        0           0
##   26       0      49       0       0       0        0           0
##   27       0      41       0       0       0        0           0
##   28       0      35       0       0       0        0           0
##   29       0      53       0       0       0        0           0
##   30       0      59       0       0       0        0           0
##   31       0      50       0       0       0        0           0
##   32       0      56       0       0       0        0           0
##   33       0      54       0       0       0        0           0
##   34       0      53       0       0       0        0           0
##   35       0       0      52       0       0        0           0
##   36       0       0      36       0       0        0           0
##   37       0       0      35       0       0        0           0
##   38       0       0      37       0       0        0           0
##   39       0       0      51       0       0        0           0
##   40       0       0      34       0       0        0           0
##   41       0       0      42       0       0        0           0
##   42       0       0      28       0       0        0           0
##   43       0       0      41       0       0        0           0
##   44       0       0      44       0       0        0           0
##   45       0       0       0      31       0        0           0
##   46       0       0       0      36       0        0           0
##   47       0       0       0      34       0        0           0
##   48       0       0       0      37       0        0           0
##   49       0       0       0      34       0        0           0
##   50       0       0       0      32       0        0           0
##   51       0       0       0      28       0        0           0
##   52       0       0       0      31       0        0           0
##   53       0       0       0      29       0        0           0
##   54       0       0       0      28       0        0           0
##   55       0       0       0       0      23        0           0
##   56       0       0       0       0      27        0           0
##   57       0       0       0       0      19        0           0
##   58       0       0       0       0      25        0           0
##   59       0       0       0       0      32        0           0
##   60       0       0       0       0      15        0           0
##   61       0       0       0       0      18        0           0
##   62       0       0       0       0      17        0           0
##   63       0       0       0       0      18        0           0
##   64       0       0       0       0      13        0           0
##   65       0       0       0       0       0       13           0
##   66       0       0       0       0       0        9           0
##   67       0       0       0       0       0       12           0
##   68       0       0       0       0       0        2           0
##   69       0       0       0       0       0        1           0
## D_2. How old are you?
##        15-24        25-34        35-44        45-54        55-64 65 and older 
##          537          510          400          320          207           37 
## [1] "Inspect value labels and relabel as necessary"
##        15-24        25-34        35-44        45-54        55-64 65 and older           NA 
##            1            2            3            4            5            6            7
# Recode completed education (D_4) into broader levels.
# NOTE(review): the 777/888/999 breaks appear intended to isolate survey
# special codes into their own bins -- confirm the label-to-bin mapping
# against the questionnaire's code list.
edu_breaks <- c(0, 6, 9, 11, 12, 13, 16, 18, 19, 777, 888, 999)
edu_labels <- c(
  "Primary or less (0-5)"                 = 1,
  "Lower secondary (6-8)"                 = 2,
  "Secondary (9-10)"                      = 3,
  "SLC (11)"                              = 4,
  "CLASS 12/Intermediate level (12)"      = 5,
  "Bachelor/Postgraduate level"           = 6,
  "Literate, but never attended school"   = 7,
  "Illiterate, and never attended school" = 8,
  "Refused"                               = 9,
  "Does not apply"                        = 10,
  "Don't Know"                            = 11
)
mydata <- ordinal_recode(variable = "D_4", break_points = edu_breaks,
                         missing = 999, value_labels = edu_labels)

##     recoded
##      [0,6) [6,9) [9,11) [11,12) [12,13) [13,16) [16,18) [18,19) [19,777) [777,888) [888,999) [999,1e+03)
##   0      1     0      0       0       0       0       0       0        0         0         0           0
##   1     31     0      0       0       0       0       0       0        0         0         0           0
##   2     54     0      0       0       0       0       0       0        0         0         0           0
##   3     71     0      0       0       0       0       0       0        0         0         0           0
##   4     73     0      0       0       0       0       0       0        0         0         0           0
##   5    149     0      0       0       0       0       0       0        0         0         0           0
##   6      0    69      0       0       0       0       0       0        0         0         0           0
##   7      0    85      0       0       0       0       0       0        0         0         0           0
##   8      0   120      0       0       0       0       0       0        0         0         0           0
##   9      0     0     84       0       0       0       0       0        0         0         0           0
##   10     0     0    102       0       0       0       0       0        0         0         0           0
##   11     0     0      0     296       0       0       0       0        0         0         0           0
##   12     0     0      0       0     264       0       0       0        0         0         0           0
##   13     0     0      0       0       0      62       0       0        0         0         0           0
##   14     0     0      0       0       0       9       0       0        0         0         0           0
##   17     0     0      0       0       0       0     304       0        0         0         0           0
##   18     0     0      0       0       0       0       0     237        0         0         0           0
## D_4. What is your highest completed education level?  [You do not need to read the re
##                 Primary or less (0-5)                 Lower secondary (6-8)                      Secondary (9-10) 
##                                   379                                   274                                   186 
##                              SLC (11)      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                   296                                   264                                    71 
##   Literate, but never attended school Illiterate, and never attended school 
##                                   304                                   237 
## [1] "Inspect value labels and relabel as necessary"
##                 Primary or less (0-5)                 Lower secondary (6-8)                      Secondary (9-10) 
##                                     1                                     2                                     3 
##                              SLC (11)      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     4                                     5                                     6 
##   Literate, but never attended school Illiterate, and never attended school                               Refused 
##                                     7                                     8                                     9 
##                        Does not apply                            Don't Know 
##                                    10                                    11
# Top code household composition variables with large and unusual numbers 

# Household-composition counts are identifying in the tail: collapse counts
# of 5 or more (D_20, living sons and daughters per the output below) into a
# single "5 or more" category. 888 and 999999 are treated as missing codes.
children_topcode_at <- 5
mydata <- top_recode("D_20", break_point = children_topcode_at,
                     missing = c(888, 999999))
## [1] "Frequency table after encoding"
## D_20. How many LIVING sons and daughters do you have?
##   0   1   2   3   4   5   6   7   8   9  10  11 888 
## 375 243 523 375 188 113  70  25   6   1   1   1  90

## [1] "Frequency table after encoding"
## D_20. How many LIVING sons and daughters do you have?
##         0         1         2         3         4 5 or more       888 
##       375       243       523       375       188       217        90
# Top code high income to the 99.5 percentile

# Top-code household income (Inc_17) at the 99.5th percentile to protect
# extreme values. Exclude the 999999 missing code AND any NA before computing
# the cutoff -- quantile() errors on NA input without na.rm, so the original
# subset `Inc_17 != 999999` (which keeps NA) was fragile. names = FALSE drops
# the "99.5%" name from the result; top_recode only needs the numeric value.
# NOTE(review): 777/888/999 also appear as Inc_17 values in the output and
# may be survey special codes -- confirm whether they belong in the quantile.
valid_income <- mydata$Inc_17[!is.na(mydata$Inc_17) & mydata$Inc_17 != 999999]
percentile_99.5 <- floor(quantile(valid_income, probs = 0.995, names = FALSE))
mydata <- top_recode(variable = "Inc_17", break_point = percentile_99.5,
                     missing = 999999)
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
##      0      5     50     60    400    500    600    700    777    888    999   1000   1200   1500   1600   2000 
##    132      1      1      1      1      5      2      1      4      1     22      3      1      4      5     26 
##   2400   2500   2600   3000   3500   4000   4500   4800   5000   6000   6500   7000   8000   8500   9000  10000 
##      2      8      1     33      2     42      3      1     99     34      1     38     20      1     24    193 
##  11000  12000  12846  13000  14000  14500  15000  15500  16000  17000  18000  19000  19135  20000  21000  22000 
##      3     54      1     11     10      1    174      1     23      8     15      6      1    220      2     12 
##  22500  23000  24000  25000  26000  27000  27500  28000  30000  32000  33000  34000  35000  36000  37000  40000 
##      1      4      5    105      1      5      1      2    177      2      1      1     54      4      1     88 
##  41000  45000  48000  50000  54000  55000  57000  60000  62000  65000  66000  67000  68000  70000  75000  79000 
##      3     17      1    113      2      4      2     48      1      7      1      1      1     15      4      1 
##  79500  80000  85000  95000  1e+05 103000 104000 110000 115000 117000 125000 130000 135000 150000 160000 170000 
##      1     13      1      1     27      1      1      1      2      1      1      1      1     11      1      1 
##  2e+05 240000 250000  3e+05 320000  5e+05  6e+05  7e+05  1e+06 
##      9      1      1      3      1      1      1      1      1

## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
##             0             5            50            60           400           500           600           700 
##           132             1             1             1             1             5             2             1 
##           777           888           999          1000          1200          1500          1600          2000 
##             4             1            22             3             1             4             5            26 
##          2400          2500          2600          3000          3500          4000          4500          4800 
##             2             8             1            33             2            42             3             1 
##          5000          6000          6500          7000          8000          8500          9000         10000 
##            99            34             1            38            20             1            24           193 
##         11000         12000         12846         13000         14000         14500         15000         15500 
##             3            54             1            11            10             1           174             1 
##         16000         17000         18000         19000         19135         20000         21000         22000 
##            23             8            15             6             1           220             2            12 
##         22500         23000         24000         25000         26000         27000         27500         28000 
##             1             4             5           105             1             5             1             2 
##         30000         32000         33000         34000         35000         36000         37000         40000 
##           177             2             1             1            54             4             1            88 
##         41000         45000         48000         50000         54000         55000         57000         60000 
##             3            17             1           113             2             4             2            48 
##         62000         65000         66000         67000         68000         70000         75000         79000 
##             1             7             1             1             1            15             4             1 
##         79500         80000         85000         95000         1e+05        103000        104000        110000 
##             1            13             1             1            27             1             1             1 
##        115000        117000        125000        130000        135000        150000        160000        170000 
##             2             1             1             1             1            11             1             1 
## 2e+05 or more 
##            19

Indirect PII - Categorical: Global recode or Top/bottom coding for extreme values

# !!!Include relevant variables in list below

# Produce frequency tables for the categorical indirect identifiers so they
# can be reviewed for small (identifying) cells.
categorical_pii_vars <- c("D_1", "D_9", "Inc_16", "Inc_20a", "HT_1cx1")
capture_tables(categorical_pii_vars)

Matching and crosstabulations: Run automated PII check

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# Categorical key variables (gender D_1, age band D_2, education D_4) used to
# build the sdcMicro object. Use <- for assignment, not = (R style).
selectedKeyVars <- c('D_1', 'D_2', 'D_4') ##!!! Replace with candidate categorical demo vars

# creating the sdcMicro object with the assigned variables
# Build the sdcMicro disclosure-risk object over the selected key variables,
# then show its k-anonymity summary (auto-printed below).
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 2011 rows and 29 variables.
##   --> Categorical key variables: D_1, D_2, D_4
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories     Mean size            Size of smallest (>0)      
##           D_1                    2 (2)  1005.500 (1005.500)                   965 (965)
##           D_2                    6 (6)   335.167  (335.167)                    37  (37)
##           D_4                    8 (8)   251.375  (251.375)                    71  (71)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 7 (0.348%)
##   - 3-anonymity: 17 (0.845%)
##   - 5-anonymity: 38 (1.890%)
## 
## ----------------------------------------------------------------------

Show values of key variables for records that violate k-anonymity

#mydata <- labelDataset(mydata)
# Flag records violating 2-anonymity. Column 2 of the individual risk slot is
# used as the equivalence-class count here -- presumably fk in sdcMicro's
# risk matrix; verify against the sdcMicro documentation.
individual_risk <- sdcInitial@risk$individual[, 2]
notAnon <- individual_risk < 2 # for 2-anonymity
mydata[notAnon, selectedKeyVars]
## # A tibble: 7 x 3
##          D_1       D_2                                       D_4
##    <dbl+lbl> <dbl+lbl>                                 <dbl+lbl>
## 1 1 [Male]   5 [55-64] 6 [Bachelor/Postgraduate level]          
## 2 2 [Female] 5 [55-64] 4 [SLC (11)]                             
## 3 2 [Female] 3 [35-44] 6 [Bachelor/Postgraduate level]          
## 4 2 [Female] 1 [15-24] 7 [Literate, but never attended school]  
## 5 1 [Male]   1 [15-24] 8 [Illiterate, and never attended school]
## 6 1 [Male]   1 [15-24] 7 [Literate, but never attended school]  
## 7 2 [Female] 5 [55-64] 5 [CLASS 12/Intermediate level (12)]
# Suppress (set to NA) key-variable values until k-anonymity is reached,
# using localSuppression's defaults; suppressed cells appear as NA below.
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables

# Pull the manipulated data back out of the sdc object and display which
# key-variable values were suppressed (NA) for the previously risky records.
extractManipData(sdcFinal)[notAnon,selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
##      D_1 D_2 D_4
## 492    1   5  NA
## 545    2   5  NA
## 696    2   3  NA
## 1154   2   1  NA
## 1322   1   1  NA
## 1435   1   1  NA
## 1839   2   5  NA
# Apply the suppression to the working data: set education (D_4) to the
# missing code 999 for records that violated 2-anonymity.
mydata[notAnon, "D_4"] <- 999

Open-ends: review responses for any sensitive information, redact as necessary

# !!!No open ends

GPS data: Displace

# !!!No GPS

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

# Write the public-use ("_PU") dataset in both Stata and SPSS formats.
public_use_base <- paste0(filename, "_PU")
haven::write_dta(mydata, paste0(public_use_base, ".dta"))
haven::write_sav(mydata, paste0(public_use_base, ".sav"))