rm(list=ls(all=t))

Setup and crate dictionary

filename <- "Round1_demo" # !!!Update filename
source ("functions_1.5.R")

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

#!!!Save flagged dictionary in .xlsx format, add "DatasetReview" to name and continue processing data with subset of flagged variables

Direct PII: variables to be removed

# !!!No direct PII variables

Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects

# !!!No direct PII - team variables

Small locations: Encode locations with pop <100,000 using random large numbers

# !!!Include relevant variables, but check their population size first to confirm they are <100,000

locvars <- c("VDC") 
mydata <- encode_location (variables= locvars, missing=999999)
## [1] "Frequency table before encoding"
## VDC. VDC code
##         Barahathawa        Dhungrekhola           Dhurkauli            Lalbandi      Malangawa N.P. 
##                  63                  66                  63                  63                  63 
##           Netraganj            Raniganj           Sankarpur      Bhimeswor N.P.               Bocha 
##                  64                  65                  62                  61                  61 
##         Dandakharka               Fasku            Katakuti            Lamidada              Melung 
##                  62                  62                  63                  62                  64 
##              Pawati             Badegau         Talramarang        BhoteNamlang               Irkhu 
##                  64                  64                  64                  66                  62 
##               Ichok            Kadambas           Langarche            Melamchi             Anaikot 
##                  63                  64                  62                  64                  63 
##    BaluwapatiDeupur   ChalalGaneshsthan    KalatiBhumidanda       MahankalChaur           Methinkot 
##                  62                  67                  62                  61                  62 
##          Patalekhet             RaviOpi              Balkot       Changunarayan            Chitapol 
##                  64                  62                  59                  62                  63 
##             Duwakot               Gundu  Madhyapur Thimi NP             Nankhel             Sirutar 
##                  63                  63                  66                  61                  58 
##             Baireni              Dhussa               Khari           Kiranchok             Naubise 
##                  62                  64                  62                  63                  64 
##           Salyantar        SunaulaBazar              Thakre            Chitlang          Churiyamai 
##                  63                  62                  64                  61                  63 
##              Fakhel           Kulekhani           Nibuwatar        Padampokhari   ShreepurChhatiwan 
##                  62                  62                  60                  65                  62 
## SisneriMahadevsthan       Birendranagar             Jutpani              Kathar          Khairahani 
##                  63                  63                  62                  63                  63 
##            Padampur          Parbatipur               Piple          Shaktikhor 
##                  62                  62                  65                  64 
## [1] "Frequency table after encoding"
## VDC. VDC code
##  980  981  982  983  984  985  986  987  988  989  990  991  992  993  994  995  996  997  998  999 1000 1001 1002 
##   63   66   63   63   63   64   65   62   61   61   62   62   63   62   64   64   64   64   66   62   63   64   62 
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 
##   64   63   62   67   62   61   62   64   62   59   62   63   63   63   66   61   58   62   64   62   63   64   63 
## 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 
##   62   64   61   63   62   62   60   65   62   63   63   62   63   63   62   62   65   64

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" of 10 or less. 

break_age <- c(0, 15,25,35,45,55,100)
labels_age <- c("Less than 15" =1, 
                "15-24" =2, 
                "25-34" =3, 
                "35-44" =4, 
                "45-54" =5, 
                "55 and older" =6,
                "NA" = 7)
mydata <- ordinal_recode (variable="D_2", break_points=break_age, missing=999999, value_labels=labels_age)

##     recoded
##      [0,15) [15,25) [25,35) [35,45) [45,55) [55,100) [100,1e+06)
##   13    132       0       0       0       0        0           0
##   14    171       0       0       0       0        0           0
##   15      0     199       0       0       0        0           0
##   16      0     244       0       0       0        0           0
##   17      0     220       0       0       0        0           0
##   18      0     210       0       0       0        0           0
##   19      0     171       0       0       0        0           0
##   20      0     193       0       0       0        0           0
##   21      0     150       0       0       0        0           0
##   22      0     138       0       0       0        0           0
##   23      0      99       0       0       0        0           0
##   24      0      81       0       0       0        0           0
##   25      0       0     109       0       0        0           0
##   26      0       0     100       0       0        0           0
##   27      0       0      93       0       0        0           0
##   28      0       0      94       0       0        0           0
##   29      0       0      81       0       0        0           0
##   30      0       0     115       0       0        0           0
##   31      0       0      71       0       0        0           0
##   32      0       0      68       0       0        0           0
##   33      0       0      59       0       0        0           0
##   34      0       0      63       0       0        0           0
##   35      0       0       0      77       0        0           0
##   36      0       0       0      46       0        0           0
##   37      0       0       0      56       0        0           0
##   38      0       0       0      42       0        0           0
##   39      0       0       0      54       0        0           0
##   40      0       0       0      74       0        0           0
##   41      0       0       0      32       0        0           0
##   42      0       0       0      54       0        0           0
##   43      0       0       0      44       0        0           0
##   44      0       0       0      51       0        0           0
##   45      0       0       0       0      56        0           0
##   46      0       0       0       0      46        0           0
##   47      0       0       0       0      37        0           0
##   48      0       0       0       0      50        0           0
##   49      0       0       0       0      40        0           0
##   50      0       0       0       0      43        0           0
##   51      0       0       0       0      33        0           0
##   52      0       0       0       0      33        0           0
##   53      0       0       0       0      32        0           0
##   54      0       0       0       0      34        0           0
##   55      0       0       0       0       0       44           0
##   56      0       0       0       0       0       24           0
##   57      0       0       0       0       0       21           0
##   58      0       0       0       0       0       28           0
##   59      0       0       0       0       0       24           0
##   60      0       0       0       0       0       31           0
##   61      0       0       0       0       0       11           0
##   62      0       0       0       0       0       21           0
##   63      0       0       0       0       0       11           0
##   64      0       0       0       0       0        9           0
##   65      0       0       0       0       0        1           0
##   66      0       0       0       0       0        1           0
## D_2. How old are you?<U+00A0>     [Use the timeline in the manual if the respondent has a ha
## Less than 15        15-24        25-34        35-44        45-54 55 and older 
##          303         1705          853          530          404          226 
## [1] "Inspect value labels and relabel as necessary"
## Less than 15        15-24        25-34        35-44        45-54 55 and older           NA 
##            1            2            3            4            5            6            7
break_edu <- c(0,6,9,11,12,13,16,17,18, 777, 888, 999)
labels_edu <- c("Primary or less (0-5)" = 1,
                "Lower secondary (6-8)" = 2,
                "Secondary (9-10)" = 3,
                "SLC (11)" = 4,
                "CLASS 12/Intermediate level (12)" = 5,
                "Bachelor/Postgraduate level" = 6,
                "Literate, but never attended school" = 7,
                "Illiterate, and never attended school"= 8,
                "Refused"= 9, 
                "Does not apply" = 10, 
                "Don't Know" = 11)
mydata <- ordinal_recode (variable="D_4", break_points=break_edu, missing=999, value_labels=labels_edu)

##     recoded
##      [0,6) [6,9) [9,11) [11,12) [12,13) [13,16) [16,17) [17,18) [18,777) [777,888) [888,999) [999,1e+03)
##   0      6     0      0       0       0       0       0       0        0         0         0           0
##   1     85     0      0       0       0       0       0       0        0         0         0           0
##   2    136     0      0       0       0       0       0       0        0         0         0           0
##   3    151     0      0       0       0       0       0       0        0         0         0           0
##   4    194     0      0       0       0       0       0       0        0         0         0           0
##   5    264     0      0       0       0       0       0       0        0         0         0           0
##   6      0   218      0       0       0       0       0       0        0         0         0           0
##   7      0   287      0       0       0       0       0       0        0         0         0           0
##   8      0   323      0       0       0       0       0       0        0         0         0           0
##   9      0     0    238       0       0       0       0       0        0         0         0           0
##   10     0     0    240       0       0       0       0       0        0         0         0           0
##   11     0     0      0     640       0       0       0       0        0         0         0           0
##   12     0     0      0       0     363       0       0       0        0         0         0           0
##   13     0     0      0       0       0      59       0       0        0         0         0           0
##   14     0     0      0       0       0      15       0       0        0         0         0           0
##   17     0     0      0       0       0       0       0     299        0         0         0           0
##   18     0     0      0       0       0       0       0       0      490         0         0           0
## D_4. What is your highest completed education level?<U+00A0>    [You do not need to read the
##                 Primary or less (0-5)                 Lower secondary (6-8)                      Secondary (9-10) 
##                                   836                                   828                                   478 
##                              SLC (11)      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                   640                                   363                                    74 
## Illiterate, and never attended school                               Refused                                  <NA> 
##                                   299                                   490                                    13 
## [1] "Inspect value labels and relabel as necessary"
##                 Primary or less (0-5)                 Lower secondary (6-8)                      Secondary (9-10) 
##                                     1                                     2                                     3 
##                              SLC (11)      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     4                                     5                                     6 
##   Literate, but never attended school Illiterate, and never attended school                               Refused 
##                                     7                                     8                                     9 
##                        Does not apply                            Don't Know 
##                                    10                                    11
# Top code household composition variables with large and unusual numbers 

mydata <- top_recode ("D_20", break_point=5, missing=c(888, 999999)) # Topcode cases with 5 or more adult household members. 
## [1] "Frequency table after encoding"
## D_20. How many children do you have?
##    0    1    2    3    4    5    6    7    8    9   10   11  888 
## 1816  443  662  507  267  153   92   29   12    2    2    2   34

## [1] "Frequency table after encoding"
## D_20. How many children do you have?
##         0         1         2         3         4 5 or more       888 
##      1816       443       662       507       267       292        34
# Top code high income to the 99.5 percentile

percentile_99.5 <- floor(quantile(mydata$Inc_17[mydata$Inc_17!=999999], probs = c(0.995)))
mydata <- top_recode (variable="Inc_17", break_point=percentile_99.5, missing=999999)
## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS)
##      0     15     30     35     36    300    400    500    600    777    888    999   1000   1200   1250   1300 
##    132      1      1      1      1      1      1      6      1     49      1    249     25      9      1      1 
##   1500   1600   2000   2083   2100   2200   2300   2500   2600   3000   3500   3800   4000   4100   4500   5000 
##     25      2     85      1      1      1      1     16      1    140      4      1    129      1      8    352 
##   5500   6000   6500   7000   7500   8000   9000   9500  10000  10400  10500  11000  12000  12200  12500  13000 
##      2    142      2    121      8    111     63      1    399      1      1      9    155      1      6     40 
##  14000  15000  16000  17000  17200  18000  19000  20000  21000  22000  23000  24000  24400  25000  26000  27000 
##     20    408     35     16      1     39      2    338      7     16      5      7      1    193      7      3 
##  28000  29000  30000  31000  32000  33000  34500  35000  36000  38000  40000  41000  41600  42000  43000  45000 
##      6      2    176      2      2      4      1     61      2      1     80      1      1      2      2     22 
##  50000  53000  55000  58000  60000  65000  66000  70000  75000  80000  85000  90000  1e+05 108000 110000 120000 
##     93      1      4      2     40      1      1     11      1     15      1      5     29      1      1      2 
## 125000 130000 150000  2e+05 320000 350000  4e+05  5e+05  7e+05  2e+06 
##      2      1     13     18      1      1      1      1      1      1

## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS)
##             0            15            30            35            36           300           400           500 
##           132             1             1             1             1             1             1             6 
##           600           777           888           999          1000          1200          1250          1300 
##             1            49             1           249            25             9             1             1 
##          1500          1600          2000          2083          2100          2200          2300          2500 
##            25             2            85             1             1             1             1            16 
##          2600          3000          3500          3800          4000          4100          4500          5000 
##             1           140             4             1           129             1             8           352 
##          5500          6000          6500          7000          7500          8000          9000          9500 
##             2           142             2           121             8           111            63             1 
##         10000         10400         10500         11000         12000         12200         12500         13000 
##           399             1             1             9           155             1             6            40 
##         14000         15000         16000         17000         17200         18000         19000         20000 
##            20           408            35            16             1            39             2           338 
##         21000         22000         23000         24000         24400         25000         26000         27000 
##             7            16             5             7             1           193             7             3 
##         28000         29000         30000         31000         32000         33000         34500         35000 
##             6             2           176             2             2             4             1            61 
##         36000         38000         40000         41000         41600         42000         43000         45000 
##             2             1            80             1             1             2             2            22 
##         50000         53000         55000         58000         60000         65000         66000         70000 
##            93             1             4             2            40             1             1            11 
##         75000         80000         85000         90000         1e+05        108000        110000        120000 
##             1            15             1             5            29             1             1             2 
##        125000        130000        150000 2e+05 or more 
##             2             1            13            24
percentile_99.5 <- floor(quantile(mydata$Inc_23[mydata$Inc_23!=999], probs = c(0.995)))
mydata <- top_recode (variable="Inc_23", break_point=percentile_99.5, missing=999)
## [1] "Frequency table after encoding"
## Inc_23. In a typical month, what is your total household expenditure? (in NRS)
##     15    200    300    400    500    600    700    777    800    900    999   1000   1070   1100   1200   1300 
##      1      1      2      1     16      1      1      3      1      1    135     35      1      1     12      1 
##   1400   1500   1600   1800   2000   2200   2500   2600   3000   3500   4000   4009   4500   5000   5500   6000 
##      1     29      1      1    131      3     52      1    255     23    260      1     14    559      1    267 
##   7000   7500   8000   9000  10000  11000  12000  13000  13500  14000  15000  16000  17000  18000  19000  20000 
##    221      3    225     82    627      4    152     40      1     13    361      8      7     21      1    200 
##  21000  22000  23000  24000  25000  27000  30000  32000  32500  35000  40000  42000  45000  50000  60000  70000 
##      4     11      3      2     85      3     68      2      1     13     17      1      4     11      8      2 
##  80000  90000  1e+05 120000  2e+05  9e+05 
##      2      1      2      1      1      1

## [1] "Frequency table after encoding"
## Inc_23. In a typical month, what is your total household expenditure? (in NRS)
##            15           200           300           400           500           600           700           777 
##             1             1             2             1            16             1             1             3 
##           800           900           999          1000          1070          1100          1200          1300 
##             1             1           135            35             1             1            12             1 
##          1400          1500          1600          1800          2000          2200          2500          2600 
##             1            29             1             1           131             3            52             1 
##          3000          3500          4000          4009          4500          5000          5500          6000 
##           255            23           260             1            14           559             1           267 
##          7000          7500          8000          9000         10000         11000         12000         13000 
##           221             3           225            82           627             4           152            40 
##         13500         14000         15000         16000         17000         18000         19000         20000 
##             1            13           361             8             7            21             1           200 
##         21000         22000         23000         24000         25000         27000         30000         32000 
##             4            11             3             2            85             3            68             2 
##         32500         35000         40000         42000         45000 50000 or more 
##             1            13            17             1             4            29

Indirect PII - Categorical: Global recode or Top/bottom coding for extreme values

# !!!Include relevant variables in list below

indirect_PII <- c("D_1",
                  "D_3",
                  "D_6",
                  "D_9",
                  "D_20",
                  "Inc_16",
                  "Inc_20a",
                  "ME_1",
                  "ME_3",
                  "ME_5",
                  "ME_7",
                  "ME_13",
                  "ME_14",
                  "ME_16",
                  "ME_17")

capture_tables (indirect_PII)

# Recode those with very specific values where more than half of the sample have actual data. 

mydata <- encode_location (variables= "D_3", missing=999999)
## [1] "Frequency table before encoding"
## D_3. What is your ethnic background?    [You do not need to read the response choices
##            chhetri     BRAHMAN (HILL)              magar              tharu             tamang              newar 
##                846                619                206                 61               1078                383 
##             muslim               kami              yadav                rai             gurung       DAMAIN/DHOLI 
##                  5                117                 13                 11                 50                 70 
##              limbu            thakuri              sarki               teli CHAMAR/HARIJAN/RAM              koiri 
##                  2                 42                 66                  6                  2                 74 
##              kurmi DUSADH/PASWAN/PASI              sonar    BRAHMAN (TARAI)      GHARTI/BHUJEL              malla 
##                  1                  6                  5                  6                 48                  1 
##             kalwar              kumal       HAJAM/THAKUR            sunuwar              sudhi              lohar 
##                 11                 25                  2                  2                  1                  3 
##              tatma             khatwe              majhi             nuniya             kumhar            danuwar 
##                  3                  1                  6                  2                  3                  4 
##      CHEPANG/PRAJA            haluwai             rajput           kayastha             badhae            marwadi 
##                 92                  1                  3                  8                  1                  4 
##              thami              darai             pahari                dom               bote   ADIBASI/JANAJATI 
##                 18                 14                  8                  1                  1                  1 
##        OTHER CASTE               <NA> 
##                 87                  1 
## [1] "Frequency table after encoding"
## D_3. What is your ethnic background?    [You do not need to read the response choices
##  980  981  982  983  984  985  986  987  988  989  990  991  992  993  994  995  996  997  998  999 1000 1001 1002 
##  846  619  206   61 1078  383    5  117   13   11   50   70    2   42   66    6    2   74    1    6    5    6   48 
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 
##    1   11   25    2    2    1    3    3    1    6    2    3    4   92    1    3    8    1    4   18   14    8    1 
## 1026 1027 1028 <NA> 
##    1    1   87    1
break_rel <- c(1,2,3, 777, 888, 999)
labels_rel <- c("Hindu" = 1,
                "Buddhist" = 2,
                "Other" = 3,
                "Refused" = 4,
                "Not applicable" = 5,
                "Don't know" = 6)
mydata <- ordinal_recode (variable="D_6", break_points=break_rel, missing=999, value_labels=labels_rel)

##    recoded
##     [1,2) [2,3) [3,777) [777,888) [888,999) [999,1e+03)
##   1  2964     0       0         0         0           0
##   2     0   941       0         0         0           0
##   3     0     0       9         0         0           0
##   4     0     0       2         0         0           0
##   6     0     0      97         0         0           0
##   9     0     0       3         0         0           0
## D_6. What is your religious background?    [You do not need to read the response choi
##    Hindu Buddhist    Other     <NA> 
##     2964      941      111        5 
## [1] "Inspect value labels and relabel as necessary"
##          Hindu       Buddhist          Other        Refused Not applicable     Don't know 
##              1              2              3              4              5              6

Matching and crosstabulations: Run automated PII check

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('D_1', 'D_2', 'D_4') ##!!! Replace with candidate categorical demo vars

# weight variable
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial
## The input dataset consists of 4021 rows and 31 variables.
##   --> Categorical key variables: D_1, D_2, D_4
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories     Mean size            Size of smallest (>0)       
##           D_1                    2 (2)  2010.500 (2010.500)                  2006 (2006)
##           D_2                    6 (6)   670.167  (670.167)                   226  (226)
##           D_4                    9 (9)   501.000  (501.000)                    74   (74)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 2 (0.050%)
##   - 3-anonymity: 3 (0.075%)
##   - 5-anonymity: 23 (0.572%)
## 
## ----------------------------------------------------------------------

Show values of key variable of records that violate k-anonymity

#mydata <- labelDataset(mydata)
notAnon <- sdcInitial@risk$individual[,2] < 2 # for 2-anonymity
mydata[notAnon,selectedKeyVars]
## # A tibble: 2 x 3
##          D_1              D_2                             D_4
##    <dbl+lbl>        <dbl+lbl>                       <dbl+lbl>
## 1 2 [Female] 1 [Less than 15] 4 [SLC (11)]                   
## 2 2 [Female] 5 [45-54]        6 [Bachelor/Postgraduate level]
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables

extractManipData(sdcFinal)[notAnon,selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
##      D_1 D_2 D_4
## 226    2   1  NA
## 2180   2   5  NA
mydata [notAnon,"D_4"] <- 999

Open-ends: review responses for any sensitive information, redact as necessary

# !!!No open ends

GPS data: Displace

# !!!No GPS

Save processed data in Stata and SPSS format

# Adds "_PU" (Public Use) to the end of the name 

haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))