# Start from a clean workspace: remove every object in the current
# environment, including hidden (dot-prefixed) ones.
# `all.names` is spelled out and given the logical constant TRUE; the earlier
# `all=t` passed the base function `t` (matrix transpose) rather than a
# logical value, and relied on partial argument matching.
rm(list = ls(all.names = TRUE))

Setup and create dictionary

# Base name of the dataset being processed.
# !!!Update filename
filename <- "Nepal Round 3_FinalClean_Labor"

# Run the shared helper script. The package start-up messages echoed below
# come from this call.
# NOTE(review): `mydata` is used later without being created in this script;
# presumably functions_1.7.R reads it using `filename` - confirm.
source("functions_1.7.R")
## --------
## This is sdcMicro v5.6.0.
## For references, please have a look at citation('sdcMicro')
## Note: since version 5.0.0, the graphical user-interface is a shiny-app that can be started with sdcApp().
## Please submit suggestions and bugs at: https://github.com/sdcTools/sdcMicro/issues
## --------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: sp
## Checking rgeos availability: TRUE
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:sdcMicro':
## 
##     freq
## rgdal: version: 1.5-23, (SVN revision 1121)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-5
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
## Overwritten PROJ_LIB was C:/Users/C_Pablo_Diego-Rosell/Documents/R/R-3.6.3/library/rgdal/proj
## Loading required package: spatstat.data
## Loading required package: spatstat.geom
## spatstat.geom 2.0-1
## 
## Attaching package: 'spatstat.geom'
## The following objects are masked from 'package:raster':
## 
##     area, rotate, shift
## Loading required package: spatstat.core
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:raster':
## 
##     getData
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: rpart
## spatstat.core 2.0-0
## Loading required package: spatstat.linnet
## spatstat.linnet 2.1-1
## 
## spatstat 2.0-1       (nickname: 'Caution: contains small parts') 
## For an introduction to spatstat, type 'beginner'
## rgeos version: 0.5-5, (SVN revision 640)
##  GEOS runtime version: 3.8.0-CAPI-1.13.1 
##  Linking to sp version: 1.4-4 
##  Polygon checking: TRUE
## 
## Spatial Point Pattern Analysis Code in S-Plus
##  
##  Version 2 - Spatial and Space-Time analysis
## 
## Attaching package: 'splancs'
## The following object is masked from 'package:raster':
## 
##     zoom
## The following object is masked from 'package:dplyr':
## 
##     tribble
## Loading required package: spam
## Loading required package: dotCall64
## Loading required package: grid
## Spam version 2.6-0 (2020-12-14) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## See https://github.com/NCAR/Fields for
##  an extensive vignette, other supplements and source code
## 
## Attaching package: 'geosphere'
## The following object is masked from 'package:spatstat.geom':
## 
##     perimeter
## 
## Attaching package: 'tibble'
## The following object is masked from 'package:splancs':
## 
##     tribble

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000), Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

#!!!Save flagged dictionary in .xlsx format and continue processing data with subset of flagged variables

Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Names of the Direct PII columns that must not appear in the released data.
dropvars <- c(
  "RvwName",
  "IDR3_18",
  "IDR3_19",
  "LE_reportedby",
  "flag_reportedby"
)
# Keep only the columns whose names are not listed in `dropvars`.
# Names in `dropvars` that are absent from `mydata` are simply ignored.
mydata <- mydata[
  !(names(mydata) %in% dropvars)
]

Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects

!!!Replace vector in "variables" field below with relevant variable names

# Encode Direct PII-team

# Replace interviewer names with numeric codes via the project helper; the
# before/after frequency tables it prints are shown below.
# NOTE(review): the output for `surveyor` shows one record with a blank name
# that receives its own code - confirm whether it should be set to missing.
mydata <- encode_direct_PII_team(
  variables = c("Srvyr", "surveyor")
)
## [1] "Frequency table before encoding"
## Srvyr. Srvyr
##       alka.adhikari    ambir.raj.kulung         amrita.roka anjana.kumari.dulal 
##                  79                  96                  89                  98 
##     ashish.shrestha bhanu.bhakta.dhakal       dev.raj.nepal dhan.kumari.darlami 
##                  82                  77                   2                  85 
##       gita.maharjan       kamala.sharma        manjula.giri min.kumari.shrestha 
##                  99                  78                  99                  86 
##       nabina.khadka      niraj.shrestha    pramila.shrestha    pratika.shrestha 
##                  80                  85                  77                  85 
## rabischandra.bhatta   ram.kumar.acharya     sajina.shrestha     sandip.shrestha 
##                  88                  88                  73                  96 
##       sapana.gautam     sarita.shrestha     tirtha.maya.rai        yamuna.karki 
##                  80                  99                 104                  86 
## [1] "Frequency table after encoding"
## Srvyr. Srvyr
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21 
##  79  96  89  98  82  77   2  85  99  78  99  86  80  85  77  85  88  88  73  96  80 
##  22  23  24 
##  99 104  86 
## [1] "Frequency table before encoding"
## surveyor. Surveyor
##                           alka.adhikari    ambir.raj.kulung         amrita.roka 
##                   1                  79                  96                  89 
## anjana.kumari.dulal     ashish.shrestha bhanu.bhakta.dhakal       dev.raj.nepal 
##                  98                  82                  77                   2 
## dhan.kumari.darlami       gita.maharjan       kamala.sharma        manjula.giri 
##                  85                  99                  78                  99 
## min.kumari.shrestha       nabina.khadka      niraj.shrestha    pramila.shrestha 
##                  86                  80                  85                  77 
##    pratika.shrestha rabischandra.bhatta   ram.kumar.acharya     sajina.shrestha 
##                  85                  87                  88                  73 
##     sandip.shrestha       sapana.gautam     sarita.shrestha     tirtha.maya.rai 
##                  96                  80                  99                 104 
##        yamuna.karki 
##                  86 
## [1] "Frequency table after encoding"
## surveyor. Surveyor
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21 
##   1  79  96  89  98  82  77   2  85  99  78  99  86  80  85  77  85  87  88  73  96 
##  22  23  24  25 
##  80  99 104  86

Small locations: Encode locations with pop <100,000 using random large numbers

!!!Include relevant variables, but check their population size first to confirm they are <100,000

# Small-location columns to encode: the overall VDC code, the per-district
# VDC/municipality columns, and the ward number.
locvars <- c(
  "vdc",
  "IDR3_6_19",
  "IDR3_6_22",
  "IDR3_6_23",
  "IDR3_6_24",
  "IDR3_6_26",
  "IDR3_6_30",
  "IDR3_6_31",
  "IDR3_6_35",
  "IDR3_7"
)
# Recode every column in `locvars` with the project helper. In the output
# below, place names become numeric codes while 999999 (passed as `missing`)
# is left unchanged.
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## vdc. VDC code
##          Barahathawa         Dhungrekhola             Lalbandi         Malangawa NP 
##                   38                   29                   61                   27 
##            Netraganj             Raniganj            Sankarpur         Bhimeswor NP 
##                   34                   37                   28                   32 
##                Bocha          Dandakharka                Fasku             Katakuti 
##                   17                   32                   31                   31 
##            Lamidanda               Melung               Pawati              Badegau 
##                   29                   34                   27                   24 
##                Irkhu         BhoteNamlang           Talamarang                Ichok 
##                   33                   32                   28                   32 
##             Kadambas            Langarche             Melamchi              Anaikot 
##                   25                   32                   34                   25 
##    Baluwapati Deupur    Chalal Ganeshthan    Kalati Bhumidanda       Mahankal Chaur 
##                   21                   41                   41                   26 
##            Methinkot           Patalekhet              Raviopi               Balkot 
##                   35                   25                   42                   22 
##        Changunarayan             Chitapol              Duwakot                Gundu 
##                   35                   34                   27                   33 
##   Madhyapur Thimi NP              Nankhel              Sirutar              Baireni 
##                   29                   33                   36                   36 
##               Dhussa                Khari            Kiranchok              Naubise 
##                   31                   32                   34                   27 
##            Salyantar        Sunaula Bazar               Thakre             Chitlang 
##                   35                   33                   24                   27 
##           Churiyamai               Fakhel         Padampokhari            Kulekhani 
##                   28                   30                   39                   37 
##            Nibuwatar   Shreepur Chhatiwan Sisneri Mahadevsthan        Birendranagar 
##                   32                   37                   29                   18 
##              Jutpani               Kathar           Khairahani             Padampur 
##                   39                   37                   32                   38 
##           Parbatipur                Piple           Shaktikhor 
##                   31                   37                   36 
## [1] "Frequency table after encoding"
## vdc. VDC code
## 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 
##  28  32  34  28  36  38  31  28  27  32  25  39  41  33  22  37  33  27  29  25  31 
## 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
##  21  29  41  37  34  27  34  32  27  34  34  32  37  27  36  24  42  31  30  39  33 
## 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 
##  32  37  35  25  32  32  32  31  29  36  18  26  35  35  32  38  37  17  33  24  29 
## 530 
##  29 
## [1] "Frequency table before encoding"
## IDR3_6_19. VDC or Municaplity of District Sarlahi
##  Barahathawa Dhungrekhola    Dhurkauli     Lalbandi Malangawa NP    Netraganj 
##           38           29           29           32           27           34 
##     Raniganj    Sankarpur       999999 
##           37           28         1757 
## [1] "Frequency table after encoding"
## IDR3_6_19. VDC or Municaplity of District Sarlahi
##    904    905    906    907    908    909    910    912 999999 
##     34     27     37     28     29     38     29     32   1757 
## [1] "Frequency table before encoding"
## IDR3_6_22. VDC or Municaplity of District Dolakha
## Bhimeswor NP        Bocha  Dandakharka        Fasku     Katakuti    Lamidanda 
##           32           17           32           31           31           29 
##       Melung       Pawati       999999 
##           34           27         1778 
## [1] "Frequency table after encoding"
## IDR3_6_22. VDC or Municaplity of District Dolakha
##    876    877    878    879    880    881    883    884 999999 
##     27     31     29     34     32     32     17     31   1778 
## [1] "Frequency table before encoding"
## IDR3_6_23. VDC or Municaplity of District Sindhupalchok
##      Badegau        Irkhu BhoteNamlang   Talamarang        Ichok     Kadambas 
##           24           33           32           28           32           25 
##    Langarche     Melamchi       999999 
##           32           34         1771 
## [1] "Frequency table after encoding"
## IDR3_6_23. VDC or Municaplity of District Sindhupalchok
##    513    514    515    516    517    518    520    521 999999 
##     33     34     32     32     25     32     28     24   1771 
## [1] "Frequency table before encoding"
## IDR3_6_24. VDC or Municaplity of District Kavrepalanchok
##           Anaikot Baluwapati Deupur Chalal Ganeshthan Kalati Bhumidanda 
##                25                21                41                41 
##    Mahankal Chaur         Methinkot        Patalekhet           Raviopi 
##                26                35                25                42 
##            999999 
##              1755 
## [1] "Frequency table after encoding"
## IDR3_6_24. VDC or Municaplity of District Kavrepalanchok
##    689    690    691    692    693    694    695    697 999999 
##     21     42     25     41     25     35     26     41   1755 
## [1] "Frequency table before encoding"
## IDR3_6_26. VDC or Municaplity of District Bhaktapur
##             Balkot      Changunarayan           Chitapol            Duwakot 
##                 22                 35                 34                 27 
##              Gundu Madhyapur Thimi NP            Nankhel            Sirutar 
##                 33                 29                 33                 36 
##             999999 
##               1762 
## [1] "Frequency table after encoding"
## IDR3_6_26. VDC or Municaplity of District Bhaktapur
##    405    406    407    408    410    411    412    413 999999 
##     34     35     33     22     33     27     36     29   1762 
## [1] "Frequency table before encoding"
## IDR3_6_30. VDC or Municaplity of District Dhading
##       Baireni        Dhussa         Khari     Kiranchok       Naubise     Salyantar 
##            36            31            32            34            27            35 
## Sunaula Bazar        Thakre        999999 
##            33            24          1759 
## [1] "Frequency table after encoding"
## IDR3_6_30. VDC or Municaplity of District Dhading
##    634    635    636    637    639    640    641    642 999999 
##     32     34     36     31     33     35     27     24   1759 
## [1] "Frequency table before encoding"
## IDR3_6_31. VDC or Municaplity of District Makwanpur
##             Chitlang           Churiyamai               Fakhel         Padampokhari 
##                   27                   28                   30                   39 
##            Kulekhani            Nibuwatar   Shreepur Chhatiwan Sisneri Mahadevsthan 
##                   37                   32                   37                   29 
##               999999 
##                 1752 
## [1] "Frequency table after encoding"
## IDR3_6_31. VDC or Municaplity of District Makwanpur
##    798    799    800    801    802    803    804    805 999999 
##     29     30     28     37     37     39     32     27   1752 
## [1] "Frequency table before encoding"
## IDR3_6_35. VDC or Municaplity of District Chitwan
## Birendranagar       Jutpani        Kathar    Khairahani      Padampur    Parbatipur 
##            18            39            37            32            38            31 
##         Piple    Shaktikhor        999999 
##            37            36          1743 
## [1] "Frequency table after encoding"
## IDR3_6_35. VDC or Municaplity of District Chitwan
##    875    876    877    878    879    880    881    882 999999 
##     39     37     18     31     37     38     32     36   1743 
## [1] "Frequency table before encoding"
## IDR3_7. Ward Number
##   1   2   3   4   5   6   7   8   9  10  11  12  14 
## 263 246 169 237 162 290 209 155 218  11  19  22  10 
## [1] "Frequency table after encoding"
## IDR3_7. Ward Number
## 533 534 535 536 537 538 539 540 541 542 543 544 545 
## 209  19 169 237 246 290 218 263 155 162  10  22  11

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" of 10 or less. 

# Cut points for the age bands: ten-year steps from 15 to 65, then 100.
# The recode output below shows them used as left-closed intervals,
# e.g. [15,25).
break_age <- c(seq(15, 65, by = 10), 100)

# Value labels for the recoded age bands (label text -> numeric code).
# NOTE(review): code 7 ("NA") appears to correspond to the final
# [100,1e+06) bin, which would hold the 999999 missing code - confirm.
labels_age <- setNames(
  as.numeric(1:7),
  c("15-24", "25-34", "35-44", "45-54", "55-64", "65 and older", "NA")
)
# Collapse reported age (IDR3_20) into the bands defined by `break_age` and
# attach the `labels_age` value labels; the helper prints the cross-check
# tables shown below.
mydata <- ordinal_recode(
  variable = "IDR3_20",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)

## [1] "Frequency table before encoding"
## IDR3_20. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 
## 18 44 60 62 63 72 68 72 78 60 49 41 35 53 59 50 56 54 53 52 36 35 37 51 34 42 28 41 
## 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
## 44 31 36 34 37 34 32 28 31 29 28 23 27 19 25 32 15 18 17 18 13 13  9 12  2  1 
##     recoded
##      [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
##   16      18       0       0       0       0        0           0
##   17      44       0       0       0       0        0           0
##   18      60       0       0       0       0        0           0
##   19      62       0       0       0       0        0           0
##   20      63       0       0       0       0        0           0
##   21      72       0       0       0       0        0           0
##   22      68       0       0       0       0        0           0
##   23      72       0       0       0       0        0           0
##   24      78       0       0       0       0        0           0
##   25       0      60       0       0       0        0           0
##   26       0      49       0       0       0        0           0
##   27       0      41       0       0       0        0           0
##   28       0      35       0       0       0        0           0
##   29       0      53       0       0       0        0           0
##   30       0      59       0       0       0        0           0
##   31       0      50       0       0       0        0           0
##   32       0      56       0       0       0        0           0
##   33       0      54       0       0       0        0           0
##   34       0      53       0       0       0        0           0
##   35       0       0      52       0       0        0           0
##   36       0       0      36       0       0        0           0
##   37       0       0      35       0       0        0           0
##   38       0       0      37       0       0        0           0
##   39       0       0      51       0       0        0           0
##   40       0       0      34       0       0        0           0
##   41       0       0      42       0       0        0           0
##   42       0       0      28       0       0        0           0
##   43       0       0      41       0       0        0           0
##   44       0       0      44       0       0        0           0
##   45       0       0       0      31       0        0           0
##   46       0       0       0      36       0        0           0
##   47       0       0       0      34       0        0           0
##   48       0       0       0      37       0        0           0
##   49       0       0       0      34       0        0           0
##   50       0       0       0      32       0        0           0
##   51       0       0       0      28       0        0           0
##   52       0       0       0      31       0        0           0
##   53       0       0       0      29       0        0           0
##   54       0       0       0      28       0        0           0
##   55       0       0       0       0      23        0           0
##   56       0       0       0       0      27        0           0
##   57       0       0       0       0      19        0           0
##   58       0       0       0       0      25        0           0
##   59       0       0       0       0      32        0           0
##   60       0       0       0       0      15        0           0
##   61       0       0       0       0      18        0           0
##   62       0       0       0       0      17        0           0
##   63       0       0       0       0      18        0           0
##   64       0       0       0       0      13        0           0
##   65       0       0       0       0       0       13           0
##   66       0       0       0       0       0        9           0
##   67       0       0       0       0       0       12           0
##   68       0       0       0       0       0        2           0
##   69       0       0       0       0       0        1           0
## [1] "Frequency table after encoding"
## IDR3_20. How old are you?
##        15-24        25-34        35-44        45-54        55-64 65 and older 
##          537          510          400          320          207           37 
## [1] "Inspect value labels and relabel as necessary"
##        15-24        25-34        35-44        45-54        55-64 65 and older 
##            1            2            3            4            5            6 
##           NA 
##            7
# Apply the same age banding to the `age` column, reusing `break_age` and
# `labels_age`.
mydata <- ordinal_recode(
  variable = "age",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)

## [1] "Frequency table before encoding"
## age. How old are you?
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 
## 18 44 60 62 63 72 68 72 78 60 49 41 35 53 59 50 56 54 53 52 36 35 37 51 34 42 28 41 
## 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
## 44 31 36 34 37 34 32 28 31 29 28 23 27 19 25 32 15 18 17 18 13 13  9 12  2  1 
##     recoded
##      [15,25) [25,35) [35,45) [45,55) [55,65) [65,100) [100,1e+06)
##   16      18       0       0       0       0        0           0
##   17      44       0       0       0       0        0           0
##   18      60       0       0       0       0        0           0
##   19      62       0       0       0       0        0           0
##   20      63       0       0       0       0        0           0
##   21      72       0       0       0       0        0           0
##   22      68       0       0       0       0        0           0
##   23      72       0       0       0       0        0           0
##   24      78       0       0       0       0        0           0
##   25       0      60       0       0       0        0           0
##   26       0      49       0       0       0        0           0
##   27       0      41       0       0       0        0           0
##   28       0      35       0       0       0        0           0
##   29       0      53       0       0       0        0           0
##   30       0      59       0       0       0        0           0
##   31       0      50       0       0       0        0           0
##   32       0      56       0       0       0        0           0
##   33       0      54       0       0       0        0           0
##   34       0      53       0       0       0        0           0
##   35       0       0      52       0       0        0           0
##   36       0       0      36       0       0        0           0
##   37       0       0      35       0       0        0           0
##   38       0       0      37       0       0        0           0
##   39       0       0      51       0       0        0           0
##   40       0       0      34       0       0        0           0
##   41       0       0      42       0       0        0           0
##   42       0       0      28       0       0        0           0
##   43       0       0      41       0       0        0           0
##   44       0       0      44       0       0        0           0
##   45       0       0       0      31       0        0           0
##   46       0       0       0      36       0        0           0
##   47       0       0       0      34       0        0           0
##   48       0       0       0      37       0        0           0
##   49       0       0       0      34       0        0           0
##   50       0       0       0      32       0        0           0
##   51       0       0       0      28       0        0           0
##   52       0       0       0      31       0        0           0
##   53       0       0       0      29       0        0           0
##   54       0       0       0      28       0        0           0
##   55       0       0       0       0      23        0           0
##   56       0       0       0       0      27        0           0
##   57       0       0       0       0      19        0           0
##   58       0       0       0       0      25        0           0
##   59       0       0       0       0      32        0           0
##   60       0       0       0       0      15        0           0
##   61       0       0       0       0      18        0           0
##   62       0       0       0       0      17        0           0
##   63       0       0       0       0      18        0           0
##   64       0       0       0       0      13        0           0
##   65       0       0       0       0       0       13           0
##   66       0       0       0       0       0        9           0
##   67       0       0       0       0       0       12           0
##   68       0       0       0       0       0        2           0
##   69       0       0       0       0       0        1           0
## [1] "Frequency table after encoding"
## age. How old are you?
##        15-24        25-34        35-44        45-54        55-64 65 and older 
##          537          510          400          320          207           37 
## [1] "Inspect value labels and relabel as necessary"
##        15-24        25-34        35-44        45-54        55-64 65 and older 
##            1            2            3            4            5            6 
##           NA 
##            7
# !!!Include relevant variables in list below

indirect_PII <- c("D_9", 
                  "HC2_O1", 
                  "HC2_O2", 
                  "HC2_O3", 
                  "HC2_O4", 
                  "HC2_O5", 
                  "HC2_O6", 
                  "H2_12_TEXT", 
                  "HC3", 
                  "HC4_1", 
                  "HC4_2", 
                  "HC4_3", 
                  "HC4_4", 
                  "D_4", 
                  "Inc_17", 
                  "con1A_gender_I1", 
                  "con1A_age_I1", 
                  "con1A_caste_I1", 
                  "con1A_crime_I1", 
                  "con1A_income_I1", 
                  "con1A_educ_I1", 
                  "con1B_gender_I1", 
                  "con1B_age_I1", 
                  "con1B_caste_I1", 
                  "con1B_crime_I1", 
                  "con1B_income_I1", 
                  "con1B_educ_I1", 
                  "T_233_1_I1", 
                  "con1A_gender_I2", 
                  "con1A_age_I2", 
                  "con1A_caste_I2", 
                  "con1A_income_I2", 
                  "con1A_educ_I2", 
                  "con1B_gender_I2", 
                  "con1B_age_I2", 
                  "con1B_caste_I2", 
                  "con1B_income_I2", 
                  "con1B_educ_I2", 
                  "con1A_gender_I3", 
                  "con1A_age_I3", 
                  "con1A_caste_I3", 
                  "con1A_income_I3", 
                  "con1A_educ_I3", 
                  "con1B_gender_I3", 
                  "con1B_age_I3", 
                  "con1B_caste_I3", 
                  "con1B_income_I3", 
                  "con1B_educ_I3", 
                  "con2A_gender_I1", 
                  "con2A_age_I1", 
                  "con2A_caste_I1", 
                  "con2A_income_I1", 
                  "con2A_educ_I1", 
                  "con2B_gender_I1", 
                  "con2B_age_I1", 
                  "con2B_caste_I1", 
                  "con2B_income_I1", 
                  "con2B_educ_I1", 
                  "con2A_gender_I2", 
                  "con2A_age_I2", 
                  "con2A_caste_I2", 
                  "con2A_income_I2", 
                  "con2A_educ_I2", 
                  "con2B_gender_I2", 
                  "con2B_age_I2", 
                  "con2B_caste_I2", 
                  "con2B_income_I2", 
                  "con2B_educ_I2", 
                  "con2A_gender_I3", 
                  "con2A_age_I3", 
                  "con2A_caste_I3", 
                  "con2A_income_I3", 
                  "con2A_educ_I3", 
                  "con2B_gender_I3", 
                  "con2B_age_I3", 
                  "con2B_caste_I3", 
                  "con2B_income_I3", 
                  "con2B_educ_I3", 
                  "P1", 
                  "P1A", 
                  "P2", 
                  "P2A", 
                  "P3", 
                  "P3A", 
                  "P4", 
                  "P4A", 
                  "P8_O1", 
                  "P8_O2", 
                  "P8_O3", 
                  "P8_3_number", 
                  "P8_4_number", 
                  "P8_5_number", 
                  "P12A", 
                  "P12A_TEXT", 
                  "P13A_O1", 
                  "P13A_O2", 
                  "P13A_10_TEXT", 
                  "P9B", 
                  "P10B", 
                  "P12B", 
                  "P13B_O1", 
                  "P13B_O2", 
                  "P13B_10_TEXT", 
                  "P9C_I1", 
                  "P10C_I1", 
                  "P11C_I1", 
                  "P11_A3_I1", 
                  "P12C_I1", 
                  "P12C_TEXT_I1", 
                  "P13C_O1_I1", 
                  "P13C_10_TEXT_I1", 
                  "P9C_I2", 
                  "P10C_I2", 
                  "P11C_I2", 
                  "P11_A3_I2", 
                  "P12C_I2", 
                  "P9D_I1", 
                  "P10D_I1", 
                  "P11D_I1", 
                  "P11_A4_I1", 
                  "P12D_I1", 
                  "P13D_O1_I1", 
                  "P13D_10_TEXT_I1", 
                  "P9D_I2", 
                  "P10D_I2", 
                  "P11D_I2", 
                  "P11_A4_I2", 
                  "P12D_I2", 
                  "P13D_O1_I2", 
                  "P13D_O2_I2", 
                  "P9E_I1", 
                  "P10E_I1", 
                  "P11E_I1", 
                  "P11_A5_I1", 
                  "P12E_I1", 
                  "P13E_O1_I1", 
                  "P13E_O2_I1", 
                  "P9E_I2", 
                  "P10E_I2", 
                  "P11E_I2", 
                  "P11_A5_I2", 
                  "P12E_I2", 
                  "P13E_O1_I2", 
                  "P14E_O1_I2", 
                  "P9E_I3", 
                  "P10E_I3", 
                  "P11E_I3", 
                  "P11_A5_I3", 
                  "P12E_I3", 
                  "P13E_O1_I3", 
                  "P14E_O1_I3", 
                  "P20A", 
                  "P19B", 
                  "P18C_I1", 
                  "P19C_I1", 
                  "P20C_I1", 
                  "P18C_I2", 
                  "P19C_I2", 
                  "P20C_I2", 
                  "P18D_I1", 
                  "P19D_I1", 
                  "P20D_I1", 
                  "P18D_I2", 
                  "P19D_I2", 
                  "P20D_I2", 
                  "P18E_I1", 
                  "P19E_I1", 
                  "P20E_I1", 
                  "NEW_2", 
                  "NEW_2_cl_I1", 
                  "P19_cl_I1", 
                  "D_9_cl_I1", 
                  "D_4_cl_I1", 
                  "NEW_2_cl_I2", 
                  "P19_cl_I2", 
                  "D_9_cl_I2", 
                  "D_4_cl_I2", 
                  "NEW_2_cl_I3", 
                  "P19_cl_I3", 
                  "D_9_cl_I3", 
                  "D_4_cl_I3", 
                  "NEW_2_cl_I4", 
                  "P19_cl_I4", 
                  "D_9_cl_I4", 
                  "D_4_cl_I4", 
                  "NEW_2_cl_I5", 
                  "P19_cl_I5", 
                  "D_9_cl_I5", 
                  "D_4_cl_I5", 
                  "NEW_2_cl_I6", 
                  "P19_cl_I6", 
                  "D_4_cl_I6", 
                  "D_8_cl_I6", 
                  "E2_2", 
                  "child_int", 
                  "forcedmarriage", 
                  "FM_self", 
                  "FM_spouse", 
                  "FM_child", 
                  "FM_childnum", 
                  "FM_parent", 
                  "FM_parentnum", 
                  "FM_sib", 
                  "FM_sibnum", 
                  "FM_self_aschild", 
                  "FM_spouse_aschild", 
                  "FM_child_aschild1", 
                  "FM_child_aschild2", 
                  "FM_parent_aschild1", 
                  "FM_parent_aschild2", 
                  "FM_sib_aschild1", 
                  "income", 
                  "incq", 
                  "noeduc", 
                  "rchild_int1", 
                  "rchild_int2", 
                  "rchild_int3", 
                  "rchild_int4", 
                  "rchild_int5", 
                  "rchild_int6", 
                  "age_rc1", 
                  "age_rc2", 
                  "age_rc3", 
                  "age_rc4", 
                  "age_rc5", 
                  "age_rc6", 
                  "menace_rc1", 
                  "menace_rc2", 
                  "noeduc_rc", 
                  "allchild_int_hh", 
                  "rchild_int_hh", 
                  "agegroup_rc1", 
                  "agegroup_rc2", 
                  "agegroup_rc3", 
                  "fiveyears_rcnum", 
                  "incqb_rc1", 
                  "incqb_rc2", 
                  "incqb_rc3", 
                  "incqb_rc4", 
                  "incqb_rc5", 
                  "incqb_rc6", 
                  "incqb_rcnum", 
                  "hhnoeduc_rc1", 
                  "hhnoeduc_rc2", 
                  "hhnoeduc_rc3", 
                  "hhnoeduc_rc4", 
                  "hhnoeduc_rc5", 
                  "hhnoeduc_rcnum")

# Hand the list of indirect-PII candidate variables to capture_tables()
# (helper defined outside this chunk; presumably sourced from functions_1.7.R).
capture_tables(indirect_PII)

# Recode those with very specific values where more than half of the sample
# have actual data.

# H2_12_TEXT is dropped outright: it holds verbatim responses in Nepali.
mydata <- mydata[is.na(match(names(mydata), "H2_12_TEXT"))]

# NOTE(review): the template comment about top-coding P3 (number of siblings,
# 10 or more) had no matching code in this section; add the recode here if P3
# is still present in the data.

# E2_2 (language used other than Nepali) has a single low-frequency category,
# so it is encoded. The result is written back to `mydata`: storing it in
# `mydata2` meant the encoded column was discarded, because every later step
# (and the sdcMicro object) is built from `mydata`.
mydata <- encode_direct_PII_team(variables = "E2_2")
## [1] "Frequency table before encoding"
## E2_2. What language did you use other than Nepali?
##  NEWAR 999999 
##      6   2005 
## [1] "Frequency table after encoding"
## E2_2. What language did you use other than Nepali?
##    1    2 
##    6 2005
# HC3: top-code at 5, so households with 5 or more adult members share one
# category; 888 and 999999 are passed as missing codes.
mydata <- top_recode(
  "HC3",
  break_point = 5,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## HC3. How many people living in your household are at least 15 years old (have complet
##   0   1   2   3   4   5   6   7   8   9  13 888 
## 751 542 470 168  52  16   4   1   1   1   2   3

## [1] "Frequency table after encoding"
## HC3. How many people living in your household are at least 15 years old (have complet
##         0         1         2         3         4 5 or more       888 
##       751       542       470       168        52        25         3

# Top-code household cash income (Inc_17) at its 99.5th percentile.
# The percentile is taken over all values except the 999999 missing code and
# is rounded down before being used as the break point.
percentile_99.5 <- floor(
  quantile(
    mydata[["Inc_17"]][mydata[["Inc_17"]] != 999999],
    probs = 0.995
  )
)
mydata <- top_recode(
  variable = "Inc_17",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table before encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
##      0      5     50     60    400    500    600    700    777    888    999   1000 
##    132      1      1      1      1      5      2      1      4      1     22      3 
##   1200   1500   1600   2000   2400   2500   2600   3000   3500   4000   4500   4800 
##      1      4      5     26      2      8      1     33      2     42      3      1 
##   5000   6000   6500   7000   8000   8500   9000  10000  11000  12000  12846  13000 
##     99     34      1     38     20      1     24    193      3     54      1     11 
##  14000  14500  15000  15500  16000  17000  18000  19000  19135  20000  21000  22000 
##     10      1    174      1     23      8     15      6      1    220      2     12 
##  22500  23000  24000  25000  26000  27000  27500  28000  30000  32000  33000  34000 
##      1      4      5    105      1      5      1      2    177      2      1      1 
##  35000  36000  37000  40000  41000  45000  48000  50000  54000  55000  57000  60000 
##     54      4      1     88      3     17      1    113      2      4      2     48 
##  62000  65000  66000  67000  68000  70000  75000  79000  79500  80000  85000  95000 
##      1      7      1      1      1     15      4      1      1     13      1      1 
##  1e+05 103000 104000 110000 115000 117000 125000 130000 135000 150000 160000 170000 
##     27      1      1      1      2      1      1      1      1     11      1      1 
##  2e+05 240000 250000  3e+05 320000  5e+05  6e+05  7e+05  1e+06 
##      9      1      1      3      1      1      1      1      1

## [1] "Frequency table after encoding"
## Inc_17. Approximately what was your household's cash income in the last month? (in NRS).
##             0             5            50            60           400           500 
##           132             1             1             1             1             5 
##           600           700           777           888           999          1000 
##             2             1             4             1            22             3 
##          1200          1500          1600          2000          2400          2500 
##             1             4             5            26             2             8 
##          2600          3000          3500          4000          4500          4800 
##             1            33             2            42             3             1 
##          5000          6000          6500          7000          8000          8500 
##            99            34             1            38            20             1 
##          9000         10000         11000         12000         12846         13000 
##            24           193             3            54             1            11 
##         14000         14500         15000         15500         16000         17000 
##            10             1           174             1            23             8 
##         18000         19000         19135         20000         21000         22000 
##            15             6             1           220             2            12 
##         22500         23000         24000         25000         26000         27000 
##             1             4             5           105             1             5 
##         27500         28000         30000         32000         33000         34000 
##             1             2           177             2             1             1 
##         35000         36000         37000         40000         41000         45000 
##            54             4             1            88             3            17 
##         48000         50000         54000         55000         57000         60000 
##             1           113             2             4             2            48 
##         62000         65000         66000         67000         68000         70000 
##             1             7             1             1             1            15 
##         75000         79000         79500         80000         85000         95000 
##             4             1             1            13             1             1 
##         1e+05        103000        104000        110000        115000        117000 
##            27             1             1             1             2             1 
##        125000        130000        135000        150000        160000        170000 
##             1             1             1            11             1             1 
## 2e+05 or more 
##            19

# Top-code `income` at the same break point already computed for Inc_17
# (percentile_99.5). `income` appears to mirror Inc_17 with the 777/888/999
# codes set to NA -- TODO confirm against the code that derives it.
# Alternative break point computed on `income` itself, kept for reference:
# percentile_99.5 <- floor(quantile(mydata$income[mydata$income!=999999], probs = c(0.995), na.rm=TRUE))
#
# The result is written back to `mydata`. Storing it in `mydata2` left the raw
# high incomes in `mydata$income`, which is the data set used for the sdcMicro
# object and all later recodes, defeating the top-coding applied to Inc_17.
mydata <- top_recode(
  variable = "income",
  break_point = percentile_99.5,
  missing = 999999
)
## [1] "Frequency table before encoding"
## income. 
##      0      5     50     60    400    500    600    700   1000   1200   1500   1600 
##    132      1      1      1      1      5      2      1      3      1      4      5 
##   2000   2400   2500   2600   3000   3500   4000   4500   4800   5000   6000   6500 
##     26      2      8      1     33      2     42      3      1     99     34      1 
##   7000   8000   8500   9000  10000  11000  12000  12846  13000  14000  14500  15000 
##     38     20      1     24    193      3     54      1     11     10      1    174 
##  15500  16000  17000  18000  19000  19135  20000  21000  22000  22500  23000  24000 
##      1     23      8     15      6      1    220      2     12      1      4      5 
##  25000  26000  27000  27500  28000  30000  32000  33000  34000  35000  36000  37000 
##    105      1      5      1      2    177      2      1      1     54      4      1 
##  40000  41000  45000  48000  50000  54000  55000  57000  60000  62000  65000  66000 
##     88      3     17      1    113      2      4      2     48      1      7      1 
##  67000  68000  70000  75000  79000  79500  80000  85000  95000  1e+05 103000 104000 
##      1      1     15      4      1      1     13      1      1     27      1      1 
## 110000 115000 117000 125000 130000 135000 150000 160000 170000  2e+05 240000 250000 
##      1      2      1      1      1      1     11      1      1      9      1      1 
##  3e+05 320000  5e+05  6e+05  7e+05  1e+06   <NA> 
##      3      1      1      1      1      1     27

## [1] "Frequency table after encoding"
## income. 2e+05
##             0             5            50            60           400           500 
##           132             1             1             1             1             5 
##           600           700          1000          1200          1500          1600 
##             2             1             3             1             4             5 
##          2000          2400          2500          2600          3000          3500 
##            26             2             8             1            33             2 
##          4000          4500          4800          5000          6000          6500 
##            42             3             1            99            34             1 
##          7000          8000          8500          9000         10000         11000 
##            38            20             1            24           193             3 
##         12000         12846         13000         14000         14500         15000 
##            54             1            11            10             1           174 
##         15500         16000         17000         18000         19000         19135 
##             1            23             8            15             6             1 
##         20000         21000         22000         22500         23000         24000 
##           220             2            12             1             4             5 
##         25000         26000         27000         27500         28000         30000 
##           105             1             5             1             2           177 
##         32000         33000         34000         35000         36000         37000 
##             2             1             1            54             4             1 
##         40000         41000         45000         48000         50000         54000 
##            88             3            17             1           113             2 
##         55000         57000         60000         62000         65000         66000 
##             4             2            48             1             7             1 
##         67000         68000         70000         75000         79000         79500 
##             1             1            15             4             1             1 
##         80000         85000         95000         1e+05        103000        104000 
##            13             1             1            27             1             1 
##        110000        115000        117000        125000        130000        135000 
##             1             2             1             1             1             1 
##        150000        160000        170000 2e+05 or more          <NA> 
##            11             1             1            19            27

Matching and crosstabulations: Run automated PII check

# Build the sdcMicro object used for the k-anonymity checks.
# Key variables are chosen after inspecting the data dictionary; every name
# must match a column in the data file exactly.
# Reference: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Categorical key variables selected: gender, occupation/education and age.
##!!! Replace with candidate categorical demo vars
selectedKeyVars <- c("D_4", "IDR3_20", "con1A_gender_I1")

# Create the sdcMicro object from the current data and print its summary.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 2011 rows and 1400 variables.
##   --> Categorical key variables: D_4, IDR3_20, con1A_gender_I1
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##     Key Variable Number of categories      Mean size            Size of smallest (>0)
##              D_4                   17 (17)   118.294  (118.294)                     1
##          IDR3_20                    6  (6)   335.167  (335.167)                    37
##  con1A_gender_I1                    2  (2)  1005.500 (1005.500)                   997
##       
##    (1)
##   (37)
##  (997)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 25 (1.243%)
##   - 3-anonymity: 57 (2.834%)
##   - 5-anonymity: 138 (6.862%)
## 
## ----------------------------------------------------------------------
# Recode of education and age to reduce risk of re-identification

# Lower bounds of the education bands. Each band is [lower, next lower); the
# recode output shows the last band running from 999 up to the missing code.
break_edu <- c(0, 6, 9, 11, 12, 13, 15, 16, 777, 888, 999)

# One label per band, in band order. The three special-code bands are:
#   [777, 888) -> 777 = Refused to answer
#   [888, 999) -> 888 = Does not apply
#   [999, ...) -> 999 = Don't know, merged with the 999999 missing code
# The previous labels were shifted by one: 777 was labelled "Does not apply",
# 888 was labelled "Don't Know", and the final band was labelled "NA".
labels_edu <- c(
  "Primary or less (0-5)" = 1,
  "Lower secondary (6-8)" = 2,
  "Secondary (9-10)" = 3,
  "SLC (11)" = 4,
  "CLASS 12/Intermediate level (12)" = 5,
  "Bachelor/Postgraduate level" = 6,
  "Literate, but never attended school" = 7,
  "Illiterate, and never attended school" = 8,
  "Refused to answer" = 9,
  "Does not apply" = 10,
  "Don't know or missing" = 11
)
# HC4_1 (spouse's highest completed education): collapse into the bands in
# break_edu and attach labels_edu; 999999 is passed as the missing code.
mydata <- ordinal_recode(
  variable = "HC4_1",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## HC4_1. What is the highest completed education level of your spouse?  [You do not need 
##                               CLASS 1                               CLASS 2 
##                                    15                                    48 
##                               CLASS 3                               CLASS 4 
##                                    48                                    68 
##                               CLASS 5                               CLASS 6 
##                                   114                                    57 
##                               CLASS 7                               CLASS 8 
##                                    77                                   114 
##                               CLASS 9                              CLASS 10 
##                                    69                                    86 
##                                   SLC           CLASS 12/Intermediate level 
##                                   188                                   126 
##                        Bachelor level  Post-Secondary Level (e.g., MA, PhD) 
##                                    34                                    11 
##   Literate, but never attended school Illiterate, and never attended school 
##                                   233                                   277 
##                        Does not apply                            Don't know 
##                                     4                                     3 
##         recoded
##          [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888)
##   1         15     0      0       0       0       0       0        0         0
##   2         48     0      0       0       0       0       0        0         0
##   3         48     0      0       0       0       0       0        0         0
##   4         68     0      0       0       0       0       0        0         0
##   5        114     0      0       0       0       0       0        0         0
##   6          0    57      0       0       0       0       0        0         0
##   7          0    77      0       0       0       0       0        0         0
##   8          0   114      0       0       0       0       0        0         0
##   9          0     0     69       0       0       0       0        0         0
##   10         0     0     86       0       0       0       0        0         0
##   11         0     0      0     188       0       0       0        0         0
##   12         0     0      0       0     126       0       0        0         0
##   13         0     0      0       0       0      34       0        0         0
##   14         0     0      0       0       0      11       0        0         0
##   15         0     0      0       0       0       0     233        0         0
##   16         0     0      0       0       0       0       0      277         0
##   888        0     0      0       0       0       0       0        0         0
##   999        0     0      0       0       0       0       0        0         0
##   999999     0     0      0       0       0       0       0        0         0
##         recoded
##          [888,999) [999,1e+06)
##   1              0           0
##   2              0           0
##   3              0           0
##   4              0           0
##   5              0           0
##   6              0           0
##   7              0           0
##   8              0           0
##   9              0           0
##   10             0           0
##   11             0           0
##   12             0           0
##   13             0           0
##   14             0           0
##   15             0           0
##   16             0           0
##   888            4           0
##   999            0           3
##   999999         0         439
## [1] "Frequency table after encoding"
## HC4_1. What is the highest completed education level of your spouse?  [You do not need 
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                   293                                   248 
##                      Secondary (9-10)                              SLC (11) 
##                                   155                                   188 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                   126                                    45 
##   Literate, but never attended school Illiterate, and never attended school 
##                                   233                                   277 
##                            Don't Know                                    NA 
##                                     4                                   442 
## [1] "Inspect value labels and relabel as necessary"
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                     1                                     2 
##                      Secondary (9-10)                              SLC (11) 
##                                     3                                     4 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     5                                     6 
##   Literate, but never attended school Illiterate, and never attended school 
##                                     7                                     8 
##                        Does not apply                            Don't Know 
##                                     9                                    10 
##                                    NA 
##                                    11
# HC4_2 (father's highest completed education): same banding and labels as
# the other HC4 items; 999999 is passed as the missing code.
mydata <- ordinal_recode(
  variable = "HC4_2",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## HC4_2. What is the highest completed education level of your father?  [You do not need 
##               Pre-school/Kindergarten                               CLASS 1 
##                                     1                                    12 
##                               CLASS 2                               CLASS 3 
##                                    27                                    31 
##                               CLASS 4                               CLASS 5 
##                                    23                                    61 
##                               CLASS 6                               CLASS 7 
##                                    15                                    21 
##                               CLASS 8                               CLASS 9 
##                                    44                                    22 
##                              CLASS 10                                   SLC 
##                                    26                                    47 
##           CLASS 12/Intermediate level                        Bachelor level 
##                                    29                                     4 
##  Post-Secondary Level (e.g., MA, PhD)   Literate, but never attended school 
##                                     2                                   121 
## Illiterate, and never attended school                     Refused to answer 
##                                   111                                     1 
##                        Does not apply                            Don't know 
##                                    51                                    11 
##         recoded
##          [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888)
##   0          1     0      0       0       0       0       0        0         0
##   1         12     0      0       0       0       0       0        0         0
##   2         27     0      0       0       0       0       0        0         0
##   3         31     0      0       0       0       0       0        0         0
##   4         23     0      0       0       0       0       0        0         0
##   5         61     0      0       0       0       0       0        0         0
##   6          0    15      0       0       0       0       0        0         0
##   7          0    21      0       0       0       0       0        0         0
##   8          0    44      0       0       0       0       0        0         0
##   9          0     0     22       0       0       0       0        0         0
##   10         0     0     26       0       0       0       0        0         0
##   11         0     0      0      47       0       0       0        0         0
##   12         0     0      0       0      29       0       0        0         0
##   13         0     0      0       0       0       4       0        0         0
##   14         0     0      0       0       0       2       0        0         0
##   15         0     0      0       0       0       0     121        0         0
##   16         0     0      0       0       0       0       0      111         0
##   777        0     0      0       0       0       0       0        0         1
##   888        0     0      0       0       0       0       0        0         0
##   999        0     0      0       0       0       0       0        0         0
##   999999     0     0      0       0       0       0       0        0         0
##         recoded
##          [888,999) [999,1e+06)
##   0              0           0
##   1              0           0
##   2              0           0
##   3              0           0
##   4              0           0
##   5              0           0
##   6              0           0
##   7              0           0
##   8              0           0
##   9              0           0
##   10             0           0
##   11             0           0
##   12             0           0
##   13             0           0
##   14             0           0
##   15             0           0
##   16             0           0
##   777            0           0
##   888           51           0
##   999            0          11
##   999999         0        1351
## [1] "Frequency table after encoding"
## HC4_2. What is the highest completed education level of your father?  [You do not need 
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                   155                                    80 
##                      Secondary (9-10)                              SLC (11) 
##                                    48                                    47 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                    29                                     6 
##   Literate, but never attended school Illiterate, and never attended school 
##                                   121                                   111 
##                        Does not apply                            Don't Know 
##                                     1                                    51 
##                                    NA 
##                                  1362 
## [1] "Inspect value labels and relabel as necessary"
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                     1                                     2 
##                      Secondary (9-10)                              SLC (11) 
##                                     3                                     4 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     5                                     6 
##   Literate, but never attended school Illiterate, and never attended school 
##                                     7                                     8 
##                        Does not apply                            Don't Know 
##                                     9                                    10 
##                                    NA 
##                                    11
# HC4_3 (mother's highest completed education): same banding and labels as
# the other HC4 items; 999999 is passed as the missing code.
mydata <- ordinal_recode(
  variable = "HC4_3",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## HC4_3. What is the highest completed education level of your mother? If you have more t
##                               CLASS 1                               CLASS 2 
##                                     3                                    13 
##                               CLASS 3                               CLASS 4 
##                                    10                                    21 
##                               CLASS 5                               CLASS 6 
##                                    24                                    13 
##                               CLASS 7                               CLASS 8 
##                                     6                                    20 
##                               CLASS 9                              CLASS 10 
##                                     5                                     9 
##                                   SLC           CLASS 12/Intermediate level 
##                                    23                                     5 
##                        Bachelor level   Literate, but never attended school 
##                                     1                                   225 
## Illiterate, and never attended school                        Does not apply 
##                                   259                                    20 
##                            Don't know 
##                                     3 
##         recoded
##          [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888)
##   1          3     0      0       0       0       0       0        0         0
##   2         13     0      0       0       0       0       0        0         0
##   3         10     0      0       0       0       0       0        0         0
##   4         21     0      0       0       0       0       0        0         0
##   5         24     0      0       0       0       0       0        0         0
##   6          0    13      0       0       0       0       0        0         0
##   7          0     6      0       0       0       0       0        0         0
##   8          0    20      0       0       0       0       0        0         0
##   9          0     0      5       0       0       0       0        0         0
##   10         0     0      9       0       0       0       0        0         0
##   11         0     0      0      23       0       0       0        0         0
##   12         0     0      0       0       5       0       0        0         0
##   13         0     0      0       0       0       1       0        0         0
##   15         0     0      0       0       0       0     225        0         0
##   16         0     0      0       0       0       0       0      259         0
##   888        0     0      0       0       0       0       0        0         0
##   999        0     0      0       0       0       0       0        0         0
##   999999     0     0      0       0       0       0       0        0         0
##         recoded
##          [888,999) [999,1e+06)
##   1              0           0
##   2              0           0
##   3              0           0
##   4              0           0
##   5              0           0
##   6              0           0
##   7              0           0
##   8              0           0
##   9              0           0
##   10             0           0
##   11             0           0
##   12             0           0
##   13             0           0
##   15             0           0
##   16             0           0
##   888           20           0
##   999            0           3
##   999999         0        1351
## [1] "Frequency table after encoding"
## HC4_3. What is the highest completed education level of your mother? If you have more t
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                    71                                    39 
##                      Secondary (9-10)                              SLC (11) 
##                                    14                                    23 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     5                                     1 
##   Literate, but never attended school Illiterate, and never attended school 
##                                   225                                   259 
##                            Don't Know                                    NA 
##                                    20                                  1354 
## [1] "Inspect value labels and relabel as necessary"
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                     1                                     2 
##                      Secondary (9-10)                              SLC (11) 
##                                     3                                     4 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     5                                     6 
##   Literate, but never attended school Illiterate, and never attended school 
##                                     7                                     8 
##                        Does not apply                            Don't Know 
##                                     9                                    10 
##                                    NA 
##                                    11
# Recode HC4_4 (grandparents' highest education level) with the shared
# education break points and value labels.
# NOTE(review): in the tables printed for this call, the 3 responses coded 999
# ("Don't know") fall into the same top interval as the 999999 missing code and
# are counted as NA after encoding -- confirm this is intended.
mydata <- ordinal_recode(
  variable = "HC4_4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## HC4_4. Think about your grandparents, and the grandparent with the most education. What
##                               CLASS 2                               CLASS 3 
##                                     2                                     2 
##                               CLASS 4                               CLASS 5 
##                                     1                                     2 
##                               CLASS 9                                   SLC 
##                                     1                                     1 
##           CLASS 12/Intermediate level   Literate, but never attended school 
##                                     2                                    29 
## Illiterate, and never attended school                            Don't know 
##                                    56                                     3 
##         recoded
##          [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888)
##   2          2     0      0       0       0       0       0        0         0
##   3          2     0      0       0       0       0       0        0         0
##   4          1     0      0       0       0       0       0        0         0
##   5          2     0      0       0       0       0       0        0         0
##   9          0     0      1       0       0       0       0        0         0
##   11         0     0      0       1       0       0       0        0         0
##   12         0     0      0       0       2       0       0        0         0
##   15         0     0      0       0       0       0      29        0         0
##   16         0     0      0       0       0       0       0       56         0
##   999        0     0      0       0       0       0       0        0         0
##   999999     0     0      0       0       0       0       0        0         0
##         recoded
##          [888,999) [999,1e+06)
##   2              0           0
##   3              0           0
##   4              0           0
##   5              0           0
##   9              0           0
##   11             0           0
##   12             0           0
##   15             0           0
##   16             0           0
##   999            0           3
##   999999         0        1912
## [1] "Frequency table after encoding"
## HC4_4. Think about your grandparents, and the grandparent with the most education. What
##                 Primary or less (0-5)                      Secondary (9-10) 
##                                     7                                     1 
##                              SLC (11)      CLASS 12/Intermediate level (12) 
##                                     1                                     2 
##   Literate, but never attended school Illiterate, and never attended school 
##                                    29                                    56 
##                                    NA 
##                                  1915 
## [1] "Inspect value labels and relabel as necessary"
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                     1                                     2 
##                      Secondary (9-10)                              SLC (11) 
##                                     3                                     4 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     5                                     6 
##   Literate, but never attended school Illiterate, and never attended school 
##                                     7                                     8 
##                        Does not apply                            Don't Know 
##                                     9                                    10 
##                                    NA 
##                                    11
# Recode D_4 (respondent's highest completed education level) with the same
# education break points and value labels used for the HC4_* variables.
mydata <- ordinal_recode(
  variable = "D_4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## D_4. What is your highest completed education level?  [You do not need to read the re
##               Pre-school/Kindergarten                               CLASS 1 
##                                     1                                    31 
##                               CLASS 2                               CLASS 3 
##                                    54                                    71 
##                               CLASS 4                               CLASS 5 
##                                    73                                   149 
##                               CLASS 6                               CLASS 7 
##                                    69                                    85 
##                               CLASS 8                               CLASS 9 
##                                   120                                    84 
##                              CLASS 10                                   SLC 
##                                   102                                   296 
##           CLASS 12/Intermediate level                        Bachelor level 
##                                   264                                    62 
##  Post-Secondary Level (e.g., MA, PhD)   Literate, but never attended school 
##                                     9                                   304 
## Illiterate, and never attended school 
##                                   237 
##     recoded
##      [0,6) [6,9) [9,11) [11,12) [12,13) [13,15) [15,16) [16,777) [777,888) [888,999)
##   0      1     0      0       0       0       0       0        0         0         0
##   1     31     0      0       0       0       0       0        0         0         0
##   2     54     0      0       0       0       0       0        0         0         0
##   3     71     0      0       0       0       0       0        0         0         0
##   4     73     0      0       0       0       0       0        0         0         0
##   5    149     0      0       0       0       0       0        0         0         0
##   6      0    69      0       0       0       0       0        0         0         0
##   7      0    85      0       0       0       0       0        0         0         0
##   8      0   120      0       0       0       0       0        0         0         0
##   9      0     0     84       0       0       0       0        0         0         0
##   10     0     0    102       0       0       0       0        0         0         0
##   11     0     0      0     296       0       0       0        0         0         0
##   12     0     0      0       0     264       0       0        0         0         0
##   13     0     0      0       0       0      62       0        0         0         0
##   14     0     0      0       0       0       9       0        0         0         0
##   15     0     0      0       0       0       0     304        0         0         0
##   16     0     0      0       0       0       0       0      237         0         0
##     recoded
##      [999,1e+06)
##   0            0
##   1            0
##   2            0
##   3            0
##   4            0
##   5            0
##   6            0
##   7            0
##   8            0
##   9            0
##   10           0
##   11           0
##   12           0
##   13           0
##   14           0
##   15           0
##   16           0
## [1] "Frequency table after encoding"
## D_4. What is your highest completed education level?  [You do not need to read the re
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                   379                                   274 
##                      Secondary (9-10)                              SLC (11) 
##                                   186                                   296 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                   264                                    71 
##   Literate, but never attended school Illiterate, and never attended school 
##                                   304                                   237 
## [1] "Inspect value labels and relabel as necessary"
##                 Primary or less (0-5)                 Lower secondary (6-8) 
##                                     1                                     2 
##                      Secondary (9-10)                              SLC (11) 
##                                     3                                     4 
##      CLASS 12/Intermediate level (12)           Bachelor/Postgraduate level 
##                                     5                                     6 
##   Literate, but never attended school Illiterate, and never attended school 
##                                     7                                     8 
##                        Does not apply                            Don't Know 
##                                     9                                    10 
##                                    NA 
##                                    11
# Re-run to check 2-anonymity

# Rebuild the SDC object from the recoded data; evaluating the bare name prints
# its summary, including the k-anonymity violation counts.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 2011 rows and 1400 variables.
##   --> Categorical key variables: D_4, IDR3_20, con1A_gender_I1
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##     Key Variable Number of categories     Mean size            Size of smallest (>0)
##              D_4                    8 (8)   251.375  (251.375)                    71
##          IDR3_20                    6 (6)   335.167  (335.167)                    37
##  con1A_gender_I1                    2 (2)  1005.500 (1005.500)                   997
##       
##   (71)
##   (37)
##  (997)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 6 (0.298%)
##   - 3-anonymity: 14 (0.696%)
##   - 5-anonymity: 51 (2.536%)
## 
## ----------------------------------------------------------------------

Show the values of the key variables for records that violate k-anonymity

#mydata <- labelDataset(mydata)
# Flag records whose value in column 2 of the individual-risk table is below 2
# (presumably the sample frequency of the key combination -- confirm), then
# print the key variables of the flagged records.
notAnon <- sdcInitial@risk$individual[, 2] < 2 # for 2-anonymity
mydata[notAnon, selectedKeyVars]
## Registered S3 method overwritten by 'cli':
##   method     from         
##   print.boxx spatstat.geom
## # A tibble: 6 x 3
##                                       D_4          IDR3_20 con1A_gender_I1
##                                 <dbl+lbl>        <dbl+lbl>       <dbl+lbl>
## 1 4 [SLC (11)]                            6 [65 and older]      2 [female]
## 2 6 [Bachelor/Postgraduate level]         4 [45-54]             1 [male]  
## 3 6 [Bachelor/Postgraduate level]         5 [55-64]             2 [female]
## 4 7 [Literate, but never attended school] 1 [15-24]             2 [female]
## 5 7 [Literate, but never attended school] 1 [15-24]             1 [male]  
## 6 3 [Secondary (9-10)]                    6 [65 and older]      2 [female]
# Apply local suppression to the SDC object, using the function's defaults.
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables

# Print the key variables of the previously flagged records as they appear in
# the manipulated data.
extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
##      D_4 IDR3_20 con1A_gender_I1
## 176   NA       6               2
## 307   NA       4               1
## 492   NA       5               2
## 1154  NA       1               2
## 1435  NA       1               1
## 1529  NA       6               2
# Overwrite D_4 for the flagged records with 11, the code labelled "NA" in the
# value labels printed for this variable.
mydata[notAnon, "D_4"] <- 11

Open-ends: review responses for any sensitive information, redact as necessary

# !!! Identify open-end variables here:
# Names of the translated open-ended text columns, passed to report_open()
# below. The order is kept exactly as originally listed.
open_ends <- c(
  "H2_12_TEXT_Translation",
  "HTNx3_2_14_TEXT_Translation",
  "HTN_5x3_TEXT_Translation",
  "HTV_1_10_TEXTx3_Translation",
  "HTV_3_11_TEXTx3_Translation",
  "CPR5i_TEXT_Translation",
  "G1_00_08_TEXT_Translation",
  "P13A_10_TEXT_Translation",
  "P14A_12_TEXT_Translation",
  "SIMPOC7A_10_TEXT_Translation",
  "P13B_10_TEXT_Translation",
  "P14B_12_TEXT_Translation",
  "SIMPOC7B_10_TEXT_Translation",
  "P13C_10_TEXT_I1_Translation",
  "P14C_12_TEXT_I1_Translation",
  "SIMPOC7C_10_TEXT_I1_Translation",
  "P14C_12_TEXT_I2_Translation",
  "P13D_10_TEXT_I1_Translation",
  "P14D_12_TEXT_I1_Translation",
  "P14D_12_TEXT_I2_Translation",
  "P13E_10_TEXT_I1_Translation",
  "P14E_12_TEXT_I1_Translation",
  "SIMPOC7E_10_TEXT_I1_Translation",
  "P14E_12_TEXT_I2_Translation",
  "P14E_12_TEXT_I3_Translation",
  "NEW_3_12_TEXT_Translation",
  "NEW_9_TEXT_Translation",
  "SIMPOC7_cl_10_TEXT_I1_Translate",
  "SIMPOC7_cl_10_TEXT_I2_Translate",
  "NEW_10_TEXT_Translation",
  "P13_cl_O3_TEXT_I1_Translation",
  "NEW_9_cl_TEXT_I1_Translation",
  "NEW_9_cl_TEXT_I2_Translation",
  "NEW_9_cl_TEXT_I3_Translation",
  "P14_cl_O2_I1_TEXT_Translation",
  "P13_cl_O2_TEXT_I2_Translation",
  "SIMPOC7_cl_10_TEXT_I3_Translate",
  "P14_cl_O1_I3_TEXT_Translation",
  "P14_cl_O1_I2_TEXT_Translation",
  "e3e_TEXT_Translation",
  "E2_11_8_TEXT_Translation",
  "E_14_7_TEXT_Translation"
)

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number

# Each assignment below replaces one open-ended response, selected by row
# position in mydata, with a manually redacted version of the text.
# NOTE(review): rows are addressed by position, so these indices are only valid
# while mydata keeps its current row order -- confirm before re-running on a
# re-sorted or filtered file.
# NOTE(review): "bother" in row 1313 is probably "brother"; left unchanged
# pending a check against the original verbatim.
mydata$E_14_7_TEXT_Translation[1313] <- "Respondent's bother was tricked in bad activities and later threatened to help [activity redacted]"
mydata$E_14_7_TEXT_Translation[1694] <- "In Q64, respondent said there was no income and later in Q307 respondent said [amount redacted] so entered the option more than 12,000 in Q307"
mydata$E_14_7_TEXT_Translation[1907] <- "GPS did not capture for about 20 minutes and started the interview without GPS. In Q64 respondent did not have any income but her/his son sent [amount redacted] the other day"
# Redaction marker spelling corrected ("ocuppation" -> "occupation") so the
# public-use file does not ship a misspelled placeholder.
mydata$NEW_9_TEXT_Translation[1895] <- "Make [occupation redacted]"
mydata$NEW_10_TEXT_Translation[1554] <- "Shop [type redacted]"

#mydata <- mydata[!names(mydata) %in% "SrvyrComment"]

GPS data: Displace

# Setup map

# Rows of the "world" map data whose region is Nepal.
countrymap <- map_data("world") %>%
  filter(region == "Nepal") # !!! Select correct country
#admin <- raster::getData("GADM", country="NP", level=0) #!!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# Boundary object read from a local RDS file (instead of the download above);
# it is passed as the `admin` argument to displace() below.
admin <- readRDS(file = "gadm36_NPL_0_sp.rds")

# Displace every (longitude, latitude) pair of GPS variables, one pair per call.
# Review the before/after summary statistics and maps that each call produces.

# !!! Include relevant variables, always longitude first, latitude second.
gps.vars <- c("Longitude", "Latitude")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 41 rows containing missing values (geom_point).

## [1] "Summary Long/Lat statistics before displacement"
##    Longitude        Latitude    
##  Min.   :84.31   Min.   :26.85  
##  1st Qu.:85.02   1st Qu.:27.55  
##  Median :85.46   Median :27.64  
##  Mean   :85.34   Mean   :27.59  
##  3rd Qu.:85.61   3rd Qu.:27.72  
##  Max.   :86.15   Max.   :28.00  
##  NA's   :41      NA's   :41
## Warning: Removed 41 rows containing missing values (geom_point).

## Warning: Removed 41 rows containing missing values (geom_point).

## Warning: Removed 41 rows containing missing values (geom_point).

## Warning: Removed 41 rows containing missing values (geom_point).

## [1] "Summary Long/Lat statistics after displacement"
##    Longitude        Latitude    
##  Min.   :84.27   Min.   :26.83  
##  1st Qu.:85.01   1st Qu.:27.54  
##  Median :85.46   Median :27.64  
##  Mean   :85.34   Mean   :27.59  
##  3rd Qu.:85.61   3rd Qu.:27.72  
##  Max.   :86.17   Max.   :28.04  
##  NA's   :41      NA's   :41     
## [1] "Processing time = 5.18069293498993"
# Displace the GPSinitial coordinate pair.
# !!! Include relevant variables, always longitude first, latitude second.
gps.vars <- c("GPSinitial_LO", "GPSinitial_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 40 rows containing missing values (geom_point).

## [1] "Summary Long/Lat statistics before displacement"
##  GPSinitial_LO   GPSinitial_LA  
##  Min.   :84.31   Min.   :26.85  
##  1st Qu.:85.03   1st Qu.:27.55  
##  Median :85.45   Median :27.65  
##  Mean   :85.34   Mean   :27.59  
##  3rd Qu.:85.61   3rd Qu.:27.72  
##  Max.   :86.15   Max.   :28.00  
##  NA's   :40      NA's   :40
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing missing values (geom_point).

## Warning: Removed 40 rows containing missing values (geom_point).

## Warning: Removed 40 rows containing missing values (geom_point).

## [1] "Summary Long/Lat statistics after displacement"
##  GPSinitial_LO   GPSinitial_LA  
##  Min.   :84.28   Min.   :26.84  
##  1st Qu.:85.02   1st Qu.:27.54  
##  Median :85.45   Median :27.64  
##  Mean   :85.34   Mean   :27.59  
##  3rd Qu.:85.60   3rd Qu.:27.73  
##  Max.   :86.19   Max.   :28.04  
##  NA's   :40      NA's   :40     
## [1] "Processing time = 5.33818366527557"
# Displace the gps_CEa coordinate pair.
# !!! Include relevant variables, always longitude first, latitude second.
gps.vars <- c("gps_CEa_LO", "gps_CEa_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 281 rows containing missing values (geom_point).

## [1] "Summary Long/Lat statistics before displacement"
##    gps_CEa_LO      gps_CEa_LA   
##  Min.   :84.31   Min.   :26.85  
##  1st Qu.:84.96   1st Qu.:27.56  
##  Median :85.45   Median :27.64  
##  Mean   :85.33   Mean   :27.59  
##  3rd Qu.:85.60   3rd Qu.:27.72  
##  Max.   :86.15   Max.   :28.00  
##  NA's   :281     NA's   :281
## Warning: Removed 281 rows containing missing values (geom_point).
## Warning: Removed 281 rows containing missing values (geom_point).

## Warning: Removed 281 rows containing missing values (geom_point).

## Warning: Removed 281 rows containing missing values (geom_point).

## [1] "Summary Long/Lat statistics after displacement"
##    gps_CEa_LO      gps_CEa_LA   
##  Min.   :84.26   Min.   :26.82  
##  1st Qu.:84.98   1st Qu.:27.54  
##  Median :85.44   Median :27.63  
##  Mean   :85.33   Mean   :27.59  
##  3rd Qu.:85.60   3rd Qu.:27.72  
##  Max.   :86.17   Max.   :28.03  
##  NA's   :281     NA's   :281    
## [1] "Processing time = 4.2291198849678"
# Displace the gpsenumimp coordinate pair.
# !!! Include relevant variables, always longitude first, latitude second.
gps.vars <- c("gpsenumimp_LO", "gpsenumimp_LA")
# May take a few minutes to process.
mydata <- displace(
  gps.vars,
  admin = admin,
  samp_num = 1,
  other_num = 100000
)
## Warning: Removed 274 rows containing missing values (geom_point).

## [1] "Summary Long/Lat statistics before displacement"
##  gpsenumimp_LO   gpsenumimp_LA  
##  Min.   :84.31   Min.   :26.85  
##  1st Qu.:84.96   1st Qu.:27.56  
##  Median :85.45   Median :27.64  
##  Mean   :85.33   Mean   :27.59  
##  3rd Qu.:85.60   3rd Qu.:27.72  
##  Max.   :86.15   Max.   :28.00  
##  NA's   :274     NA's   :274
## Warning: Removed 274 rows containing missing values (geom_point).
## Warning: Removed 274 rows containing missing values (geom_point).

## Warning: Removed 274 rows containing missing values (geom_point).

## Warning: Removed 274 rows containing missing values (geom_point).

## [1] "Summary Long/Lat statistics after displacement"
##  gpsenumimp_LO   gpsenumimp_LA  
##  Min.   :84.28   Min.   :26.84  
##  1st Qu.:84.98   1st Qu.:27.55  
##  Median :85.44   Median :27.64  
##  Mean   :85.32   Mean   :27.59  
##  3rd Qu.:85.60   3rd Qu.:27.72  
##  Max.   :86.19   Max.   :28.04  
##  NA's   :274     NA's   :274    
## [1] "Processing time = 3.88036923011144"

Save processed data in Stata and SPSS formats

Adds "_PU" (Public Use) to the end of the name

# Write the processed data next to the input name with a "_PU" suffix:
# once as a Stata .dta file and once as an SPSS .sav file.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)