# Start from an empty workspace, including hidden (dot-prefixed) objects.
# `all.names` is spelled out and given the logical TRUE: a bare `t` is base
# R's transpose function (unless something else has been assigned to `t`),
# not a logical, so hidden objects were not reliably listed for removal.
# NOTE(review): wiping the global environment inside a script is generally
# discouraged; prefer running the script in a fresh R session.
rm(list = ls(all.names = TRUE))

Setup filenames

# Name of the processed dataset. !!!Update filename
filename <- "Nepal_HT_Study_Rounds_1_2_3_Processed"

# R script that holds the helper functions. !!!Update helper functions file
functions_vers <- "functions_1.6.R"

Setup and create dictionary

# Run the helper script whose path is stored in `functions_vers`.
source(file = functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000) Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

#!!!Save flagged dictionary in .xlsx format and continue processing data with subset of flagged variables

Direct PII: variables to be removed

# !!!No direct PII

Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects

!!!Replace vector in "variables" field below with relevant variable names

# Encode Direct PII-team: pass the interviewer variable to the
# encode_direct_PII_team helper (presumably defined in the sourced helper
# script) and keep the returned dataset.

mydata <- encode_direct_PII_team(variables = "Srvyr")
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
##                           alka.adhikari    ambir.raj.kulung         amrita.roka anjana.kumari.dulal 
##               15052                  79                  96                  89                  98 
##     ashish.shrestha bhanu.bhakta.dhakal       dev.raj.nepal dhan.kumari.darlami       gita.maharjan 
##                  82                  77                   2                  85                  99 
##       kamala.sharma        manjula.giri min.kumari.shrestha       nabina.khadka      niraj.shrestha 
##                  78                  99                  86                  80                  85 
##    pramila.shrestha    pratika.shrestha rabischandra.bhatta   ram.kumar.acharya     sajina.shrestha 
##                  77                  85                  88                  88                  73 
##     sandip.shrestha       sapana.gautam     sarita.shrestha     tirtha.maya.rai        yamuna.karki 
##                  96                  80                  99                 104                  86 
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
##     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17    18    19 
## 15052    79    96    89    98    82    77     2    85    99    78    99    86    80    85    77    85    88    88 
##    20    21    22    23    24    25 
##    73    96    80    99   104    86

Small locations: Encode locations with pop <100,000 using random large numbers

!!!Include relevant variables, but check their population size first to confirm they are <100,000

# Location variables handed to encode_location; 999999 is passed as the
# missing-value code.
locvars <- c("dist_vdc", "ward")
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## dist_vdc. VDC code
##         Barahathawa        Dhungrekhola           Dhurkauli            Lalbandi      Malangawa N.P. 
##                 227                 224                 214                 219                 216 
##           Netraganj            Raniganj           Sankarpur      Bhimeswor N.P.               Bocha 
##                 226                 231                 213                 215                 200 
##         Dandakharka               Fasku            Katakuti            Lamidada              Melung 
##                 218                 217                 220                 215                 226 
##              Pawati             Badegau         Talramarang        BhoteNamlang               Irkhu 
##                 218                 215                 225                 229                 214 
##               Ichok            Kadambas           Langarche            Melamchi             Anaikot 
##                 220                 216                 218                 224                 214 
##    BaluwapatiDeupur   ChalalGaneshsthan    KalatiBhumidanda       MahankalChaur           Methinkot 
##                 207                 242                 227                 209                 221 
##          Patalekhet             RaviOpi              Balkot       Changunarayan            Chitapol 
##                 217                 228                 199                 221                 223 
##             Duwakot               Gundu  Madhyapur Thimi NP             Nankhel             Sirutar 
##                 216                 222                 227                 216                 210 
##             Baireni              Dhussa               Khari           Kiranchok             Naubise 
##                 222                 223                 218                 223                 218 
##           Salyantar        SunaulaBazar              Thakre            Chitlang          Churiyamai 
##                 224                 219                 215                 210                 217 
##              Fakhel           Kulekhani           Nibuwatar        Padampokhari   ShreepurChhatiwan 
##                 216                 223                 212                 234                 223 
## SisneriMahadevsthan       Birendranagar             Jutpani              Kathar          Khairahani 
##                 218                 207                 225                 226                 221 
##            Padampur          Parbatipur               Piple          Shaktikhor       Chhayachhetra 
##                 224                 215                 232                 226                 188 
##           Damachaur            Devsthal            Dhanwang           Phalawang           Sibaratha 
##                 189                 195                 190                 192                 186 
##          Siddheswar             Tribeni            Baijapur             Binauna           Chisapani 
##                 189                 187                 189                 186                 195 
##           Khaskusma            Kohalpur           Nepalgunj             Rajhena          Samserganj 
##                 186                 186                 180                 186                 189 
## [1] "Frequency table after encoding"
## dist_vdc. VDC code
## 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 
## 234 222 186 214 228 217 224 215 199 188 214 227 220 216 210 186 216 215 219 232 195 210 209 221 223 226 226 215 226 
## 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 
## 200 216 221 180 189 189 215 195 216 213 223 226 207 227 207 222 189 218 217 216 187 231 224 190 186 218 223 186 225 
## 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 
## 221 220 242 214 215 218 217 223 227 219 189 186 212 218 218 224 225 218 223 224 229 192 
## [1] "Frequency table before encoding"
## ward. Ward Number
##     1     2     3     4     5     6     7     8     9    10    11    12    13    14  <NA> 
##   913   883   563   813   566   940   737   578   787    36    47    60    38    48 10054 
## [1] "Frequency table after encoding"
## ward. Ward Number
##   235   236   237   238   239   240   241   242   243   244   245   246   247   248  <NA> 
##    60   787   883   737   813   578   566   913   563   940    36    48    38    47 10054

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" of 10 or less. 

# Cut points for the age bands: 0, then every 10 years from 15 to 55, then 100.
break_age <- c(0, seq(15, 55, by = 10), 100)

# Value labels for the recoded variable: label text -> numeric code 1..7.
labels_age <- setNames(
  as.numeric(1:7),
  c("Less than 15", "15-24", "25-34", "35-44", "45-54", "55 and older", "NA")
)

# Recode "age" with the cut points and labels above; 999999 is passed as the
# missing-value code.
mydata <- ordinal_recode(
  variable = "age",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)

## [1] "Frequency table before encoding"
## age. Age
##  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41 
## 564 717 845 955 877 804 701 786 629 563 459 331 490 441 411 384 365 530 294 328 264 263 361 217 260 195 228 337 152 
##  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66 
## 246 198 205 245 208 154 218 166 172 143 131 136 131 190  98  93 111  95 141  55  92  41  34   6   3 
##     recoded
##      [0,15) [15,25) [25,35) [35,45) [45,55) [55,100) [100,1e+06)
##   13    564       0       0       0       0        0           0
##   14    717       0       0       0       0        0           0
##   15      0     845       0       0       0        0           0
##   16      0     955       0       0       0        0           0
##   17      0     877       0       0       0        0           0
##   18      0     804       0       0       0        0           0
##   19      0     701       0       0       0        0           0
##   20      0     786       0       0       0        0           0
##   21      0     629       0       0       0        0           0
##   22      0     563       0       0       0        0           0
##   23      0     459       0       0       0        0           0
##   24      0     331       0       0       0        0           0
##   25      0       0     490       0       0        0           0
##   26      0       0     441       0       0        0           0
##   27      0       0     411       0       0        0           0
##   28      0       0     384       0       0        0           0
##   29      0       0     365       0       0        0           0
##   30      0       0     530       0       0        0           0
##   31      0       0     294       0       0        0           0
##   32      0       0     328       0       0        0           0
##   33      0       0     264       0       0        0           0
##   34      0       0     263       0       0        0           0
##   35      0       0       0     361       0        0           0
##   36      0       0       0     217       0        0           0
##   37      0       0       0     260       0        0           0
##   38      0       0       0     195       0        0           0
##   39      0       0       0     228       0        0           0
##   40      0       0       0     337       0        0           0
##   41      0       0       0     152       0        0           0
##   42      0       0       0     246       0        0           0
##   43      0       0       0     198       0        0           0
##   44      0       0       0     205       0        0           0
##   45      0       0       0       0     245        0           0
##   46      0       0       0       0     208        0           0
##   47      0       0       0       0     154        0           0
##   48      0       0       0       0     218        0           0
##   49      0       0       0       0     166        0           0
##   50      0       0       0       0     172        0           0
##   51      0       0       0       0     143        0           0
##   52      0       0       0       0     131        0           0
##   53      0       0       0       0     136        0           0
##   54      0       0       0       0     131        0           0
##   55      0       0       0       0       0      190           0
##   56      0       0       0       0       0       98           0
##   57      0       0       0       0       0       93           0
##   58      0       0       0       0       0      111           0
##   59      0       0       0       0       0       95           0
##   60      0       0       0       0       0      141           0
##   61      0       0       0       0       0       55           0
##   62      0       0       0       0       0       92           0
##   63      0       0       0       0       0       41           0
##   64      0       0       0       0       0       34           0
##   65      0       0       0       0       0        6           0
##   66      0       0       0       0       0        3           0
## [1] "Frequency table after encoding"
## age. Age
## Less than 15        15-24        25-34        35-44        45-54 55 and older 
##         1281         6950         3770         2399         1704          959 
## [1] "Inspect value labels and relabel as necessary"
## Less than 15        15-24        25-34        35-44        45-54 55 and older           NA 
##            1            2            3            4            5            6            7
# Recode "ager" using the break_age / labels_age objects currently in scope;
# 999999 is passed as the missing-value code.
mydata <- ordinal_recode(
  variable = "ager",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)

## [1] "Frequency table before encoding"
## ager. Average Age (years)
##  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41 
## 564 717 845 955 877 804 701 786 629 563 459 331 490 441 411 384 365 530 294 328 264 263 361 217 260 195 228 337 152 
##  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66 
## 246 198 205 245 208 154 218 166 172 143 131 136 131 190  98  93 111  95 141  55  92  41  34   6   3 
##     recoded
##      [0,15) [15,25) [25,35) [35,45) [45,55) [55,100) [100,1e+06)
##   13    564       0       0       0       0        0           0
##   14    717       0       0       0       0        0           0
##   15      0     845       0       0       0        0           0
##   16      0     955       0       0       0        0           0
##   17      0     877       0       0       0        0           0
##   18      0     804       0       0       0        0           0
##   19      0     701       0       0       0        0           0
##   20      0     786       0       0       0        0           0
##   21      0     629       0       0       0        0           0
##   22      0     563       0       0       0        0           0
##   23      0     459       0       0       0        0           0
##   24      0     331       0       0       0        0           0
##   25      0       0     490       0       0        0           0
##   26      0       0     441       0       0        0           0
##   27      0       0     411       0       0        0           0
##   28      0       0     384       0       0        0           0
##   29      0       0     365       0       0        0           0
##   30      0       0     530       0       0        0           0
##   31      0       0     294       0       0        0           0
##   32      0       0     328       0       0        0           0
##   33      0       0     264       0       0        0           0
##   34      0       0     263       0       0        0           0
##   35      0       0       0     361       0        0           0
##   36      0       0       0     217       0        0           0
##   37      0       0       0     260       0        0           0
##   38      0       0       0     195       0        0           0
##   39      0       0       0     228       0        0           0
##   40      0       0       0     337       0        0           0
##   41      0       0       0     152       0        0           0
##   42      0       0       0     246       0        0           0
##   43      0       0       0     198       0        0           0
##   44      0       0       0     205       0        0           0
##   45      0       0       0       0     245        0           0
##   46      0       0       0       0     208        0           0
##   47      0       0       0       0     154        0           0
##   48      0       0       0       0     218        0           0
##   49      0       0       0       0     166        0           0
##   50      0       0       0       0     172        0           0
##   51      0       0       0       0     143        0           0
##   52      0       0       0       0     131        0           0
##   53      0       0       0       0     136        0           0
##   54      0       0       0       0     131        0           0
##   55      0       0       0       0       0      190           0
##   56      0       0       0       0       0       98           0
##   57      0       0       0       0       0       93           0
##   58      0       0       0       0       0      111           0
##   59      0       0       0       0       0       95           0
##   60      0       0       0       0       0      141           0
##   61      0       0       0       0       0       55           0
##   62      0       0       0       0       0       92           0
##   63      0       0       0       0       0       41           0
##   64      0       0       0       0       0       34           0
##   65      0       0       0       0       0        6           0
##   66      0       0       0       0       0        3           0
## [1] "Frequency table after encoding"
## ager. Average Age (years)
## Less than 15        15-24        25-34        35-44        45-54 55 and older 
##         1281         6950         3770         2399         1704          959 
## [1] "Inspect value labels and relabel as necessary"
## Less than 15        15-24        25-34        35-44        45-54 55 and older           NA 
##            1            2            3            4            5            6            7
# Redefine the age bands for this variable: every 10 years from 15 to 65,
# then 100.
break_age <- c(seq(15, 65, by = 10), 100)

# Value labels for the recoded variable: label text -> numeric code 1..7.
labels_age <- setNames(
  as.numeric(1:7),
  c("15-24", "25-34", "35-44", "45-54", "55-64", "65 and older", "NA")
)

# Recode "IDR3_20" with the cut points and labels above; 999999 is passed as
# the missing-value code.
mydata <- ordinal_recode(
  variable = "IDR3_20",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)

## [1] "Frequency table before encoding"
## IDR3_20. How old are you?
##    16    17    18    19    20    21    22    23    24    25    26    27    28    29    30    31    32    33    34 
##    18    44    60    62    63    72    68    72    78    60    49    41    35    53    59    50    56    54    53 
##    35    36    37    38    39    40    41    42    43    44    45    46    47    48    49    50    51    52    53 
##    52    36    35    37    51    34    42    28    41    44    31    36    34    37    34    32    28    31    29 
##    54    55    56    57    58    59    60    61    62    63    64    65    66    67    68    69  <NA> 
##    28    23    27    19    25    32    15    18    17    18    13    13     9    12     2     1 15052 
##     recoded
##      [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
##   16      18       0       0       0       0        0           0
##   17      44       0       0       0       0        0           0
##   18      60       0       0       0       0        0           0
##   19      62       0       0       0       0        0           0
##   20      63       0       0       0       0        0           0
##   21      72       0       0       0       0        0           0
##   22      68       0       0       0       0        0           0
##   23      72       0       0       0       0        0           0
##   24      78       0       0       0       0        0           0
##   25       0      60       0       0       0        0           0
##   26       0      49       0       0       0        0           0
##   27       0      41       0       0       0        0           0
##   28       0      35       0       0       0        0           0
##   29       0      53       0       0       0        0           0
##   30       0      59       0       0       0        0           0
##   31       0      50       0       0       0        0           0
##   32       0      56       0       0       0        0           0
##   33       0      54       0       0       0        0           0
##   34       0      53       0       0       0        0           0
##   35       0       0      52       0       0        0           0
##   36       0       0      36       0       0        0           0
##   37       0       0      35       0       0        0           0
##   38       0       0      37       0       0        0           0
##   39       0       0      51       0       0        0           0
##   40       0       0      34       0       0        0           0
##   41       0       0      42       0       0        0           0
##   42       0       0      28       0       0        0           0
##   43       0       0      41       0       0        0           0
##   44       0       0      44       0       0        0           0
##   45       0       0       0      31       0        0           0
##   46       0       0       0      36       0        0           0
##   47       0       0       0      34       0        0           0
##   48       0       0       0      37       0        0           0
##   49       0       0       0      34       0        0           0
##   50       0       0       0      32       0        0           0
##   51       0       0       0      28       0        0           0
##   52       0       0       0      31       0        0           0
##   53       0       0       0      29       0        0           0
##   54       0       0       0      28       0        0           0
##   55       0       0       0       0      23        0           0
##   56       0       0       0       0      27        0           0
##   57       0       0       0       0      19        0           0
##   58       0       0       0       0      25        0           0
##   59       0       0       0       0      32        0           0
##   60       0       0       0       0      15        0           0
##   61       0       0       0       0      18        0           0
##   62       0       0       0       0      17        0           0
##   63       0       0       0       0      18        0           0
##   64       0       0       0       0      13        0           0
##   65       0       0       0       0       0       13           0
##   66       0       0       0       0       0        9           0
##   67       0       0       0       0       0       12           0
##   68       0       0       0       0       0        2           0
##   69       0       0       0       0       0        1           0
## [1] "Frequency table after encoding"
## IDR3_20. How old are you?
##        15-24        25-34        35-44        45-54        55-64 65 and older         <NA> 
##          537          510          400          320          207           37        15052 
## [1] "Inspect value labels and relabel as necessary"
##        15-24        25-34        35-44        45-54        55-64 65 and older           NA 
##            1            2            3            4            5            6            7
# Remove variables derived from ordinal variables that have been recoded

remove_ordinals <- c("age2", "ln_hhincome", "ln_exp")
# Keep every column whose name has no match in remove_ordinals.
mydata <- mydata[is.na(match(names(mydata), remove_ordinals))]

# !!!Include relevant variables in list below

# Names of the indirect-PII variables passed to capture_tables.
indirect_PII <- c(
  "d6", "d9", "inc17", "inc23", "em16_2", "cm29",
  "me3", "me5", "me13", "me14", "ME_16", "edu2",
  "ageg", "ageg2", "incomecat", "inc16r", "d6r",
  "ethnic", "caste", "me3r", "me14r", "me5r", "me7r",
  "ethnicity", "education", "childnum"
)

capture_tables(indirect_PII)

# Top code household composition variables with large and unusual numbers

# Topcode cases with 5 or more children; 888 and 999999 are passed as
# missing-value codes.
mydata <- top_recode(
  "childnum",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## childnum. Number of children
##    0    1    2    3    4    5    6    7    8    9   10   11 <NA> 
## 7247 1989 2942 2301 1205  679  372  132   52   12    7    6  119

## [1] "Frequency table after encoding"
## childnum. Number of children
##         0         1         2         3         4 5 or more      <NA> 
##      7247      1989      2942      2301      1205      1260       119

# Topcode cases with 5 or more children; 888 and 999999 are passed as
# missing-value codes.
mydata <- top_recode(
  "d20",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## d20. How many children do you have?
##    0    1    2    3    4    5    6    7    8    9   10   11  888 
## 7247 1989 2942 2301 1205  679  372  132   52   12    7    6  119

## [1] "Frequency table after encoding"
## d20. How many children do you have?
##         0         1         2         3         4 5 or more       888 
##      7247      1989      2942      2301      1205      1260       119

# Top code high income to the 99.5 percentile

# The percentile is taken over values other than the 999999 missing code.
# na.rm = TRUE is required for robustness: subsetting with `!=` turns any NA
# in the column into an NA element, and quantile() stops with an error on NA
# input unless na.rm is TRUE. Results are unchanged when the column has no NA.
# NOTE(review): the frequency table for inc17 also lists 777, 888 and 999; if
# those are non-response codes they currently enter the percentile - confirm.
percentile_99.5 <- floor(quantile(
  mydata$inc17[mydata$inc17 != 999999],
  probs = c(0.995),
  na.rm = TRUE
))
mydata <- top_recode(variable = "inc17", break_point = percentile_99.5, missing = 999999)
## [1] "Frequency table before encoding"
## inc17. Approximately what was your household's cash income in the last month? (in NRS)
##       0      10      15      30      35      36     300     400     500     600     777     888     999    1000 
##     564       3       3       4       3       4       3       9      27       3     201       4    1020     106 
##    1200    1250    1300    1400    1500    1600    2000    2083    2100    2200    2300    2500    2600    3000 
##      38       4       3       3     118       8     382       3       3       3       4      73       3     570 
##    3500    3800    4000    4100    4500    5000    5500    5600    6000    6500    7000    7500    8000    8500 
##      23       3     535       4      33    1493       8       3     579       7     520      29     509       3 
##    9000    9500   10000   10400   10500   11000   12000   12200   12500   12800   13000   14000   15000   16000 
##     255       4    1709       3       7      41     671       4      23       3     179      94    1636     140 
##   17000   17200   18000   19000   20000   21000   22000   23000   24000   24400   24500   25000   26000   27000 
##      82       4     178      13    1420      28      75      20      31       4       3     816      26      12 
##   28000   29000   30000   31000   32000   33000   34500   35000   36000   38000   39000   40000   41000   41600 
##      27       9     751      11      21      18       4     254      17       3       3     342       2       3 
##   42000   43000   44000   45000   50000   52000   53000   55000   58000   59500   60000   65000   66000   70000 
##       7       9       3      83     414       3       4      27       8       3     170       4       3      44 
##   75000   80000   85000   90000   1e+05  108000  109000  110000  120000  125000  130000  150000  180000  190000 
##      16      56      13      21     115       3       3       4      13       8       3      50       3       3 
##   2e+05   3e+05  320000  350000   4e+05   5e+05  630000   7e+05 1500000   2e+06 
##      74       3       4       4       4       3       3       4       3       4

## [1] "Frequency table after encoding"
## inc17. Approximately what was your household's cash income in the last month? (in NRS)
##             0            10            15            30            35            36           300           400 
##           564             3             3             4             3             4             3             9 
##           500           600           777           888           999          1000          1200          1250 
##            27             3           201             4          1020           106            38             4 
##          1300          1400          1500          1600          2000          2083          2100          2200 
##             3             3           118             8           382             3             3             3 
##          2300          2500          2600          3000          3500          3800          4000          4100 
##             4            73             3           570            23             3           535             4 
##          4500          5000          5500          5600          6000          6500          7000          7500 
##            33          1493             8             3           579             7           520            29 
##          8000          8500          9000          9500         10000         10400         10500         11000 
##           509             3           255             4          1709             3             7            41 
##         12000         12200         12500         12800         13000         14000         15000         16000 
##           671             4            23             3           179            94          1636           140 
##         17000         17200         18000         19000         20000         21000         22000         23000 
##            82             4           178            13          1420            28            75            20 
##         24000         24400         24500         25000         26000         27000         28000         29000 
##            31             4             3           816            26            12            27             9 
##         30000         31000         32000         33000         34500         35000         36000         38000 
##           751            11            21            18             4           254            17             3 
##         39000         40000         41000         41600         42000         43000         44000         45000 
##             3           342             2             3             7             9             3            83 
##         50000         52000         53000         55000         58000         59500         60000         65000 
##           414             3             4            27             8             3           170             4 
##         66000         70000         75000         80000         85000         90000         1e+05        108000 
##             3            44            16            56            13            21           115             3 
##        109000        110000        120000        125000        130000        150000        180000        190000 
##             3             4            13             8             3            50             3             3 
## 2e+05 or more 
##           106

# Top code household expenditure (inc23) at the 99.5 percentile.
# The percentile is taken over values other than the 999999 missing code.
# na.rm = TRUE is required for robustness: subsetting with `!=` turns any NA
# in the column into an NA element, and quantile() stops with an error on NA
# input unless na.rm is TRUE. Results are unchanged when the column has no NA.
# NOTE(review): the frequency table for inc23 also lists 777 and 999; if those
# are non-response codes they currently enter the percentile - confirm.
percentile_99.5 <- floor(quantile(
  mydata$inc23[mydata$inc23 != 999999],
  probs = c(0.995),
  na.rm = TRUE
))
mydata <- top_recode(variable = "inc23", break_point = percentile_99.5, missing = 999999)
## [1] "Frequency table before encoding"
## inc23. In a typical month, what is your total household expenditure? (in NRS)
##     15    200    300    400    500    600    700    777    800    900    999   1000   1070   1100   1200   1300 
##      4      3     10      3     59      3      3     14      3      4    582    135      3      4     47      7 
##   1400   1500   1600   1800   2000   2200   2400   2500   2600   2800   3000   3500   4000   4009   4500   5000 
##      3    128      4      4    564     11      3    209      4      3   1061     97   1120      3     56   2461 
##   5500   6000   6500   7000   7500   8000   9000  10000  11000  12000  13000  13500  14000  15000  16000  17000 
##      6   1192      3    980     11    910    353   2615     22    629    157      4     55   1517     45     30 
##  18000  19000  20000  21000  22000  23000  24000  25000  27000  28000  30000  32000  32500  35000  40000  42000 
##     85      4    847     16     45     14     13    327     12      3    268      7      3     57     76      2 
##  45000  50000  60000  70000  80000  90000  1e+05 120000 140000  2e+05  9e+05 
##     13     56     30     10      7      4      7      3      3      3      4

## [1] "Frequency table after encoding"
## inc23. In a typical month, what is your total household expenditure? (in NRS)
##            15           200           300           400           500           600           700           777 
##             4             3            10             3            59             3             3            14 
##           800           900           999          1000          1070          1100          1200          1300 
##             3             4           582           135             3             4            47             7 
##          1400          1500          1600          1800          2000          2200          2400          2500 
##             3           128             4             4           564            11             3           209 
##          2600          2800          3000          3500          4000          4009          4500          5000 
##             4             3          1061            97          1120             3            56          2461 
##          5500          6000          6500          7000          7500          8000          9000         10000 
##             6          1192             3           980            11           910           353          2615 
##         11000         12000         13000         13500         14000         15000         16000         17000 
##            22           629           157             4            55          1517            45            30 
##         18000         19000         20000         21000         22000         23000         24000         25000 
##            85             4           847            16            45            14            13           327 
##         27000         28000         30000         32000         32500         35000         40000         42000 
##            12             3           268             7             3            57            76             2 
##         45000 50000 or more 
##            13           127

# Caste / ethnic background (D_3): encode the categories with encode_location()

mydata <- encode_location(
  variables = "D_3",
  missing = 999999
)
## [1] "Frequency table before encoding"
## D_3. What is your ethnic background?    [You do not need to read the response choices
##            chhetri     BRAHMAN (HILL)              magar              tharu             tamang              newar 
##               4334               2424               1017                654               3730               1361 
##             muslim               kami              yadav                rai             gurung       DAMAIN/DHOLI 
##                 23                636                 69                 44                177                315 
##              limbu            thakuri              sarki               teli CHAMAR/HARIJAN/RAM              koiri 
##                  8                258                235                 21                  7                260 
##              kurmi DUSADH/PASWAN/PASI              sonar    BRAHMAN (TARAI)      GHARTI/BHUJEL              malla 
##                  4                 30                 34                 30                168                  3 
##             kalwar              kumal       HAJAM/THAKUR            sunuwar              sudhi              lohar 
##                 38                117                  9                 10                  3                 15 
##              tatma             khatwe              majhi             nuniya             kumhar            danuwar 
##                  9                  9                 24                  8                 10                 13 
##      CHEPANG/PRAJA            haluwai             rajput           kayastha             badhae            marwadi 
##                319                  4                 17                 27                  3                 15 
##              thami              darai             pahari                dom               bote   ADIBASI/JANAJATI 
##                 61                 51                 34                  4                  4                  3 
##               badi        OTHER CASTE               <NA> 
##                 12                395                  7 
## [1] "Frequency table after encoding"
## D_3. What is your ethnic background?    [You do not need to read the response choices
##  384  385  386  387  388  389  390  391  392  393  394  395  396  397  398  399  400  401  402  403  404  405  406 
##  235    3    8    9 2424    3    9   38   27    4   10  654   10    7   24   15   21  319   13  177   34   34   30 
##  407  408  409  410  411  412  413  414  415  416  417  418  419  420  421  422  423  424  425  426  427  428  429 
##  260    4    3  258  636   12    9    8  117    3 1017   44   61 3730   69  315   30    4 1361   15 4334   51  168 
##  430  431  432  433 <NA> 
##   23   17  395    4    7
# Religion (d6): collapse into Hindu / Buddhist / Other.
# Break points give the lower bound of each recoded interval; every code from 3
# up to (but not including) 777 falls into "Other".

break_rel <- c(1, 2, 3, 777, 888, 999)
labels_rel <- c(
  "Hindu" = 1,
  "Buddhist" = 2,
  "Other" = 3,
  "Refused" = 4,
  "Not applicable" = 5,
  "Don't know" = 6
)
mydata <- ordinal_recode(
  variable = "d6",
  break_points = break_rel,
  missing = 999,
  value_labels = labels_rel
)

## [1] "Frequency table before encoding"
## d6. What is your religious background?    [You do not need to read the response choi
##          hindu       buddhist          islam         kirant      christian OTHER RELIGION           <NA> 
##          13305           3278             41              7            393             10             29 
##    recoded
##     [1,2) [2,3) [3,777) [777,888) [888,999) [999,1e+03)
##   1 13305     0       0         0         0           0
##   2     0  3278       0         0         0           0
##   3     0     0      41         0         0           0
##   4     0     0       7         0         0           0
##   6     0     0     393         0         0           0
##   9     0     0      10         0         0           0
## [1] "Frequency table after encoding"
## d6. What is your religious background?    [You do not need to read the response choi
##    Hindu Buddhist    Other     <NA> 
##    13305     3278      451       29 
## [1] "Inspect value labels and relabel as necessary"
##          Hindu       Buddhist          Other        Refused Not applicable     Don't know 
##              1              2              3              4              5              6
# Education (d4): group the detailed levels into standard categories.
# Each break point is the lower bound of one recoded interval, and the labels
# are listed in the same order as the intervals.

break_edu <- c(0, 6, 9, 11, 12, 13, 17, 18, 777, 888, 999)
labels_edu <- c(
  "Primary or less (0-5)" = 1,
  "Lower secondary (6-8)" = 2,
  "Secondary (9-10)" = 3,
  "SLC (11)" = 4,
  "CLASS 12/Intermediate level (12)" = 5,
  "Bachelor/Postgraduate level" = 6,
  "Literate, but never attended school" = 7,
  "Illiterate, and never attended school" = 8,
  "Refused" = 9,
  "Does not apply" = 10,
  "Don't Know" = 11
)
mydata <- ordinal_recode(
  variable = "d4",
  break_points = break_edu,
  missing = 999,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## d4. What is your highest completed education level?<U+00A0>    [You do not need to read the
##               PRE-SCHOOL/KINDERGARTEN                               CLASS 1                               CLASS 2 
##                                    23                                   321                                   578 
##                               CLASS 3                               CLASS 4                               CLASS 5 
##                                   634                                   802                                  1180 
##                               CLASS 6                               CLASS 7                               CLASS 8 
##                                   925                                  1271                                  1401 
##                               CLASS 9                              CLASS 10                                   slc 
##                                  1023                                  1078                                  2577 
##           CLASS 12/INTERMEDIATE LEVEL                        BACHELOR LEVEL                          MASTER LEVEL 
##                                  1479                                   261                                    81 
##   LITERATE, BUT NEVER ATTENDED SCHOOL ILLITERATE, AND NEVER ATTENDED SCHOOL                                  <NA> 
##                                  1277                                  2104                                    48 
##     recoded
##      [0,6) [6,9) [9,11) [11,12) [12,13) [13,17) [17,18) [18,777) [777,888) [888,999) [999,1e+03)
##   0     23     0      0       0       0       0       0        0         0         0           0
##   1    321     0      0       0       0       0       0        0         0         0           0
##   2    578     0      0       0       0       0       0        0         0         0           0
##   3    634     0      0       0       0       0       0        0         0         0           0
##   4    802     0      0       0       0       0       0        0         0         0           0
##   5   1180     0      0       0       0       0       0        0         0         0           0
##   6      0   925      0       0       0       0       0        0         0         0           0
##   7      0  1271      0       0       0       0       0        0         0         0           0
##   8      0  1401      0       0       0       0       0        0         0         0           0
##   9      0     0   1023       0       0       0       0        0         0         0           0
##   10     0     0   1078       0       0       0       0        0         0         0           0
##   11     0     0      0    2577       0       0       0        0         0         0           0
##   12     0     0      0       0    1479       0       0        0         0         0           0
##   13     0     0      0       0       0     261       0        0         0         0           0
##   14     0     0      0       0       0      81       0        0         0         0           0
##   17     0     0      0       0       0       0    1277        0         0         0           0
##   18     0     0      0       0       0       0       0     2104         0         0           0
## [1] "Frequency table after encoding"
## d4. What is your highest completed education level?<U+00A0>    [You do not need to read the
##                 Primary or less (0-5)                 Lower secondary (6-8)                      Secondary (9-10) 
##                                  3538                                  3597                                  2101 
##                              SLC (11)      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                  2577                                  1479                                   342 
##   Literate, but never attended school Illiterate, and never attended school                                  <NA> 
##                                  1277                                  2104                                    48 
## [1] "Inspect value labels and relabel as necessary"
##                 Primary or less (0-5)                 Lower secondary (6-8)                      Secondary (9-10) 
##                                     1                                     2                                     3 
##                              SLC (11)      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     4                                     5                                     6 
##   Literate, but never attended school Illiterate, and never attended school                               Refused 
##                                     7                                     8                                     9 
##                        Does not apply                            Don't Know 
##                                    10                                    11
# Marital status (d9): keep "Married" (code 1) and pool every other status
# (codes 2 and above) into "Others".

break_mar <- c(1, 2, 1000)
labels_mar <- c(
  "Married" = 1,
  "Others" = 2
)
mydata <- ordinal_recode(
  variable = "d9",
  break_points = break_mar,
  missing = 999999,
  value_labels = labels_mar
)

## [1] "Frequency table before encoding"
## d9. What is your marital status?
##            Married Separated/Divorced            Widowed      Never Married 
##              10418                100                281               6264 
##    recoded
##     [1,2) [2,1e+03) [1e+03,1e+06)
##   1 10418         0             0
##   2     0       100             0
##   3     0       281             0
##   4     0      6264             0
## [1] "Frequency table after encoding"
## d9. What is your marital status?
## Married  Others 
##   10418    6645 
## [1] "Inspect value labels and relabel as necessary"
## Married  Others 
##       1       2

Matching and crosstabulations: Run automated PII check

# Key variables for the sdcMicro object are chosen from the dictionary review.
# Reference: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Every name listed must match a variable name in the data file.
# Categorical key variables selected: gender, occupation/education and age
selectedKeyVars <- c("D_1", "edu2", "age") ##!!! Replace with candidate categorical demo vars

# Build the sdcMicro object from the selected key variables
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial # No cases violate 2-anonymity.
## The input dataset consists of 17063 rows and 410 variables.
##   --> Categorical key variables: D_1, edu2, age
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories     Mean size            Size of smallest (>0)       
##           D_1                    2 (2)  8531.500 (8531.500)                  8495 (8495)
##          edu2                    8 (8)  2430.714 (2430.714)                   342  (342)
##           age                    6 (6)  2843.833 (2843.833)                   959  (959)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 0 (0.000%)
##   - 3-anonymity: 0 (0.000%)
##   - 5-anonymity: 9 (0.053%)
## 
## ----------------------------------------------------------------------

Open-ends: review responses for any sensitive information, redact as necessary

# !!! List the open-ended (free-text) variables to be reported for review:
open_ends <- c(
  "HTV_1_10_TEXT",
  "HTV_3_11_TEXTx3",
  "HTV_3_11_TEXTx3_Translation"
)
report_open(list_open_ends = open_ends)

# Review "verbatims.csv" and note which variables (and which row numbers) must
# be deleted or redacted. The two free-text columns below are removed outright.
# NOTE(review): "HTV_3_11_TEXTx3_Translation" is listed in open_ends but is not
# dropped here; confirm it is intentionally kept.

mydata <- mydata[!names(mydata) %in% c("HTV_1_10_TEXT", "HTV_3_11_TEXTx3")]

GPS data: Displace

# !!!No GPS data

Drop duplicate variables to avoid problems when opening in SPSS

# Remove the column "htv_1_1r"; all other columns are kept in their order.
mydata <- mydata[names(mydata) != "htv_1_1r"]

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

# Export the processed data twice: once as a Stata file and once as an SPSS
# file, each named after `filename` with the "_PU" suffix.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)