# Clear the workspace before the review starts, including hidden
# (dot-prefixed) objects.
# The original passed `all = t`: `t` is base R's transpose function rather than
# the logical TRUE (unless it has been redefined), so hidden objects were not
# requested as intended. Spell out `all.names` and use TRUE explicitly.
rm(list = ls(all.names = TRUE))

Set up filenames

# !!!Update filename: name of the dataset being reviewed.
filename <- "bhsection2"
# !!!Update helper functions file: R script that holds the helper functions.
functions_vers <- "functions_1.7.R"

Set up data and functions, and create a dictionary for dataset review

# Run the helper-functions script whose path is stored in `functions_vers`.
source(file = functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (population <100,000)
# Large Location: Location (population >100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!! No Direct PII

Direct PII-team: Encode field team names

# !!! No Direct PII-team

Small locations: Encode locations with pop <100,000 using random large numbers

!!! Include relevant variables, but first check their population sizes to confirm they are <100,000

# Columns to remove from the dataset.
dropvars <- "dise"
# Keep only the columns whose names have no match in `dropvars`.
mydata <- mydata[is.na(match(names(mydata), dropvars))]

# Location identifiers to encode (labelled "Block Code" and "Village Code" in
# the printed tables).
locvars <- c("q006_block_id", "q007_vlg_id")
# Replace the location codes via the encode_location() helper; 999999 is handed
# over as its `missing` argument.
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## q006_block_id. 6 Block Code
##    1    2    3    4    5    6    7    8    9 <NA> 
## 1265 1042 1343 2751  585 1336  915 2905 3681  217 
## [1] "Frequency table after encoding"
## q006_block_id. 6 Block Code
##  279  280  281  282  283  284  285  286  287 <NA> 
## 1265 3681  585 1042 2905 1336 2751 1343  915  217 
## [1] "Frequency table before encoding"
## q007_vlg_id. 7 Village Code
##    1    2    3    4    5    6    7    9   10   11   12   13   15   16   17   18   19   20 
##  133  105   92  106  149  203  175  101   93  131  166  132   92  107  139  106  125  128 
##   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38 
##  195  158  131  135  236  191  173  139  109  106  181  164  143  120  260  139  118  138 
##   39   40   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56 
##  175  116  108  108  118  186  143  147  119  109  112   94  103  129  232  158  170  151 
##   57   58   59   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74 
##  120  132   79  159  180  102  136  109  211  117  155  122  177   98  100  137  113  156 
##   75   76   77   78   80   81   82   83   84   85   87   88   89   90   91   92   93   94 
##  131  107  150  224  186  119  164   98  115   89  116  169   82  118  127  143  133   83 
##   95   96   97   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112 
##  116  172  160  129  187   98  160  149   93  127  110  224  107  173  135  127  132  189 
##  113  114  115  116  117  118  119 <NA> 
##   87  157  129   92  173  141  102  217 
## [1] "Frequency table after encoding"
## q007_vlg_id. 7 Village Code
##  265  266  267  268  269  270  271  272  273  274  275  276  277  278  279  280  281  282 
##   79  100  181  186  107  139  127  107  156  129  166  115  160  127  128  139  189  108 
##  283  284  285  286  287  288  289  290  291  292  293  294  295  296  297  298  299  300 
##  120  131  143  195  138  109   93  120  160  106   87  139  172  132   82  143   93  108 
##  301  302  303  304  305  306  307  308  309  310  311  312  313  314  315  316  317  318 
##  137  133  113  131  155  107  159   92  136  116  102  149  151  118   92  127  236  132 
##  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336 
##  122  149  112  109   89  164   94  186  133  141  135  116   83  177  170   92  105  173 
##  337  338  339  340  341  342  343  344  345  346  347  348  349  350  351  352  353  354 
##  110  116  180  150  173  119  224  101  211  106  118  147  164  129  118  187  109  135 
##  355  356  357  358  359  360  361  362  363  364  365  366  367  368  369  370  371  372 
##  158  191  125  169  158  131  232   98  173  175  203   98  157   98  106  103  175  129 
##  373  374  375  376  377  378  379 <NA> 
##  143  132  260  224  102  119  117  217

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Top-code age at 80: per the printed "after" table, values of 80 and above
# are reported as "80 or more".
mydata <- top_recode(
  variable = "q203_age",
  break_point = 80,
  missing = NA
)
## [1] "Frequency table before encoding"
## q203_age. 203 How old is ?
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
##   80  194  240  291  269  445  409  528  562  610 1187  789  889  655  536  503  422  310 
##   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35 
##  339  137  210   93  116   78   67  201   93   86  190   70  622   59  292  149  113  864 
##   36   37   38   39   40   41   42   43   44   45   46   47   48   49   50   51   52   53 
##  119  110  215   69  627   32  117   52   28  329   34   26   48   18  234   13   33   13 
##   54   55   56   57   58   59   60   61   62   63   64   65   66   67   68   69   70   71 
##   11  166   25   17   47   10  290   15   31   23    6  148   14    8   25    6  146    6 
##   72   73   74   75   76   77   78   79   80   82   83   85   86   88   90   91   92   93 
##   19    6    3   31    8    2    5    5   56    3    2   22    1    1   18    1    2    1 
##   95   97   99  100  102  108 <NA> 
##    3    1    1    1    1    1   37

## [1] "Frequency table after encoding"
## q203_age. 203 How old is ?
##          0          1          2          3          4          5          6          7 
##         80        194        240        291        269        445        409        528 
##          8          9         10         11         12         13         14         15 
##        562        610       1187        789        889        655        536        503 
##         16         17         18         19         20         21         22         23 
##        422        310        339        137        210         93        116         78 
##         24         25         26         27         28         29         30         31 
##         67        201         93         86        190         70        622         59 
##         32         33         34         35         36         37         38         39 
##        292        149        113        864        119        110        215         69 
##         40         41         42         43         44         45         46         47 
##        627         32        117         52         28        329         34         26 
##         48         49         50         51         52         53         54         55 
##         48         18        234         13         33         13         11        166 
##         56         57         58         59         60         61         62         63 
##         25         17         47         10        290         15         31         23 
##         64         65         66         67         68         69         70         71 
##          6        148         14          8         25          6        146          6 
##         72         73         74         75         76         77         78         79 
##         19          6          3         31          8          2          5          5 
## 80 or more       <NA> 
##        115         37

# Top-code age at first school enrolment at 12: per the printed "after" table,
# values of 12 and above are reported as "12 or more".
# NOTE(review): the frequency table lists a non-numeric level "age"
# (6 records) that survives the recode -- confirm whether it should be cleaned.
mydata <- top_recode(
  variable = "q208_age",
  break_point = 12,
  missing = NA
)
## [1] "Frequency table before encoding"
## q208_age. 208 At what age did  first enroll in school?
##  age    2    3    4    5    6    7    8    9   10   11   12   13   14   16   17   18   30 
##    6   50  682  782 4307 2479  578  215   64   59   10   20    4    6    2    1    1    1 
##   37   40   78 <NA> 
##    1    2    1 6769

## [1] "Frequency table after encoding"
## q208_age. 208 At what age did  first enroll in school?
##        age          2          3          4          5          6          7          8 
##          6         50        682        782       4307       2479        578        215 
##          9         10         11 12 or more       <NA> 
##         64         59         10         39       6769

# Top-code school enrolment fees at 25000: per the printed "after" table,
# values of 25000 and above are reported as "25000 or more".
mydata <- top_recode(
  variable = "q217_fee",
  break_point = 25000,
  missing = NA
)
## [1] "Frequency table before encoding"
## q217_fee. 217 How much did you have to pay in fees to ’s school in order to enroll him/her
##      0      1      2      3      5      6      7      8     10     11     12     15     20 
##   4558      1     12      1     11      2      1      1     59     14      9     12    172 
##     21     22     25     30     35     40     42     50     51     52     60     65     70 
##      6      2     21     46      1     13      2    178      4      1      5      6      5 
##     75     80    100    110    115    120    125    130    135    140    150    160    165 
##      2      1    151      5      1      6      1      6      1      2     73      3      2 
##    170    200    210    220    225    230    250    260    265    270    275    280    290 
##      3     87      2      3      1      1     52      2      3      2      8      1      1 
##    300    307    315    320    325    327    330    345    350    355    360    365    370 
##     86      1      1      1      1      1      2      1     44      3      9      8      4 
##    375    380    385    400    420    425    450    460    463    465    470    475    480 
##      3      1      1     33      1      1     34      3      1      1      1      2      1 
##    490    500    510    520    525    550    560    575    600    615    650    665    700 
##      1    143      1      1      2     17      1      2     22      3      7      6     12 
##    720    735    750    800    850    870    900    950    960   1000   1010   1020   1050 
##      2      2      6     11      4      2      1      2      1     49      1      1      1 
##   1100   1150   1160   1200   1250   1300   1400   1450   1500   1530   1551   1560   1580 
##      6      2      1     19      2      8      2      1     33      2      1      3      1 
##   1600   1620   1700   1720   1780   1800   1900   2000   2100   2200   2300   2400   2500 
##      4      1      3      1      1      6      1     50      2      3      1     10     21 
##   2700   2800   2900   3000   3050   3100   3300   3360   3500   3550   3600   3700   3800 
##      3      2      1     50      1      1      2      1     21      1     17      1      3 
##   3900   3960   4000   4200   4440   4500   4560   4680   4770   4800   4850   5000   5200 
##      3      1     31      8      1      9      1      1      1      8      1     43      1 
##   5400   5500   5540   5600   6000   6100   6480   6500   6550   6600   6640   7000   7200 
##      3      5      1      1     38      2      1      3      1      2      1     19      3 
##   7400   7500   7700   7800   8000   8400   8500   8900   9000   9100   9500  10000  11000 
##      1      2      1      2     27      1      2      1      7      1      1     17      1 
##  11130  11700  12000  12050  12500  13000  13500  14000  14400  15000  16000  16800  20000 
##      1      1      9      1      3      1      1      2      1      9      1      1      2 
##  22000  24000  25000  26000  27000  30000  35000  36000  40000  42000  45000  48000  50000 
##      2     11      4      1      1      7      3      1      1      1      2      1      1 
##  60000  70000 150000 195000   <NA> 
##      1      1      1      1   9316

## [1] "Frequency table after encoding"
## q217_fee. 217 How much did you have to pay in fees to ’s school in order to enroll him/her
##             0             1             2             3             5             6 
##          4558             1            12             1            11             2 
##             7             8            10            11            12            15 
##             1             1            59            14             9            12 
##            20            21            22            25            30            35 
##           172             6             2            21            46             1 
##            40            42            50            51            52            60 
##            13             2           178             4             1             5 
##            65            70            75            80           100           110 
##             6             5             2             1           151             5 
##           115           120           125           130           135           140 
##             1             6             1             6             1             2 
##           150           160           165           170           200           210 
##            73             3             2             3            87             2 
##           220           225           230           250           260           265 
##             3             1             1            52             2             3 
##           270           275           280           290           300           307 
##             2             8             1             1            86             1 
##           315           320           325           327           330           345 
##             1             1             1             1             2             1 
##           350           355           360           365           370           375 
##            44             3             9             8             4             3 
##           380           385           400           420           425           450 
##             1             1            33             1             1            34 
##           460           463           465           470           475           480 
##             3             1             1             1             2             1 
##           490           500           510           520           525           550 
##             1           143             1             1             2            17 
##           560           575           600           615           650           665 
##             1             2            22             3             7             6 
##           700           720           735           750           800           850 
##            12             2             2             6            11             4 
##           870           900           950           960          1000          1010 
##             2             1             2             1            49             1 
##          1020          1050          1100          1150          1160          1200 
##             1             1             6             2             1            19 
##          1250          1300          1400          1450          1500          1530 
##             2             8             2             1            33             2 
##          1551          1560          1580          1600          1620          1700 
##             1             3             1             4             1             3 
##          1720          1780          1800          1900          2000          2100 
##             1             1             6             1            50             2 
##          2200          2300          2400          2500          2700          2800 
##             3             1            10            21             3             2 
##          2900          3000          3050          3100          3300          3360 
##             1            50             1             1             2             1 
##          3500          3550          3600          3700          3800          3900 
##            21             1            17             1             3             3 
##          3960          4000          4200          4440          4500          4560 
##             1            31             8             1             9             1 
##          4680          4770          4800          4850          5000          5200 
##             1             1             8             1            43             1 
##          5400          5500          5540          5600          6000          6100 
##             3             5             1             1            38             2 
##          6480          6500          6550          6600          6640          7000 
##             1             3             1             2             1            19 
##          7200          7400          7500          7700          7800          8000 
##             3             1             2             1             2            27 
##          8400          8500          8900          9000          9100          9500 
##             1             2             1             7             1             1 
##         10000         11000         11130         11700         12000         12050 
##            17             1             1             1             9             1 
##         12500         13000         13500         14000         14400         15000 
##             3             1             1             2             1             9 
##         16000         16800         20000         22000         24000 25000 or more 
##             1             1             2             2            11            27 
##          <NA> 
##          9316

# Top-code out-of-pocket education spending at 25000: per the printed "after"
# table, values of 25000 and above are reported as "25000 or more".
mydata <- top_recode(
  variable = "q218_edu_cost",
  break_point = 25000,
  missing = NA
)
## [1] "Frequency table before encoding"
## q218_edu_cost. 218 In the last 12 months, how much has this household spent out of pocket for '
##      0      5     13     14     50     60    100    110    120    150    180    200    220 
##     42      1      1      1      2      2      8      3      1      9      1     34      1 
##    250    265    270    300    335    350    360    365    400    420    450    500    510 
##      6      1      2     45      1      6      3      2     53      1      6    328      1 
##    520    540    550    580    590    600    630    640    650    660    680    700    750 
##      2      3     14      1      1    100      1      1     10      3      1     82      5 
##    760    780    800    820    850    900    930    950    960   1000   1025   1050   1060 
##      1      1    109      4      6     20      1      6      1   1001      1      1      1 
##   1100   1150   1160   1200   1250   1300   1340   1400   1450   1500   1600   1650   1700 
##     17      3      1    155      8     13      1      3      3    749      7      1      3 
##   1800   1900   1950   2000   2100   2150   2200   2300   2400   2500   2600   2650   2700 
##     17      1      3   1192      3      1      7      1      5    289      1      1      4 
##   2800   3000   3250   3333   3400   3500   3600   4000   4200   4500   4600   4800   5000 
##      1    737      1      4      1     34     11    234      1      5      1      3    472 
##   5200   5500   5770   5800   6000   6120   6500   6900   7000   7300   7500   7800   8000 
##      1      2      1      1    145      1      1      1     67      1      1      1     73 
##   8400   8500   8600   8700   9000   9600   9800  10000  11000  11200  12000  12200  13000 
##      2      1      1      1      9      2      1    143      4      1     39      1      5 
##  14000  15000  16000  18000  19000  19500  20000  21650  22000  23000  24000  25000  30000 
##      3     56      7      2      1      1     24      1      2      1      2     11     10 
##  36000  40000  48000  50000  60000  61200  65000  70000  72000  1e+05 105000   <NA> 
##      4      3      1      7      1      1      2      3      1      4      1   9453

## [1] "Frequency table after encoding"
## q218_edu_cost. 218 In the last 12 months, how much has this household spent out of pocket for '
##             0             5            13            14            50            60 
##            42             1             1             1             2             2 
##           100           110           120           150           180           200 
##             8             3             1             9             1            34 
##           220           250           265           270           300           335 
##             1             6             1             2            45             1 
##           350           360           365           400           420           450 
##             6             3             2            53             1             6 
##           500           510           520           540           550           580 
##           328             1             2             3            14             1 
##           590           600           630           640           650           660 
##             1           100             1             1            10             3 
##           680           700           750           760           780           800 
##             1            82             5             1             1           109 
##           820           850           900           930           950           960 
##             4             6            20             1             6             1 
##          1000          1025          1050          1060          1100          1150 
##          1001             1             1             1            17             3 
##          1160          1200          1250          1300          1340          1400 
##             1           155             8            13             1             3 
##          1450          1500          1600          1650          1700          1800 
##             3           749             7             1             3            17 
##          1900          1950          2000          2100          2150          2200 
##             1             3          1192             3             1             7 
##          2300          2400          2500          2600          2650          2700 
##             1             5           289             1             1             4 
##          2800          3000          3250          3333          3400          3500 
##             1           737             1             4             1            34 
##          3600          4000          4200          4500          4600          4800 
##            11           234             1             5             1             3 
##          5000          5200          5500          5770          5800          6000 
##           472             1             2             1             1           145 
##          6120          6500          6900          7000          7300          7500 
##             1             1             1            67             1             1 
##          7800          8000          8400          8500          8600          8700 
##             1            73             2             1             1             1 
##          9000          9600          9800         10000         11000         11200 
##             9             2             1           143             4             1 
##         12000         12200         13000         14000         15000         16000 
##            39             1             5             3            56             7 
##         18000         19000         19500         20000         21650         22000 
##             2             1             1            24             1             2 
##         23000         24000 25000 or more          <NA> 
##             1             2            49          9453

# Top-code age at leaving school at 25: per the printed "after" table, values
# of 25 and above are reported as "25 or more".
mydata <- top_recode(
  variable = "q221_age_stop_schl",
  break_point = 25,
  missing = NA
)
## [1] "Frequency table before encoding"
## q221_age_stop_schl. 221 At what age did  stop attending school?
##     0     1     5     6     7     8     9    10    11    12    13    14    15    16    17 
##     2     1     7    22    34    87   104   184   223   253   230   266   256   222   154 
##    18    19    20    21    22    23    24    25    26    27    28    29    30    35    40 
##   120    46    47    21    17     5     7     9     2     3     2     2     4     1     1 
##  <NA> 
## 13708

## [1] "Frequency table after encoding"
## q221_age_stop_schl. 221 At what age did  stop attending school?
##          0          1          5          6          7          8          9         10 
##          2          1          7         22         34         87        104        184 
##         11         12         13         14         15         16         17         18 
##        223        253        230        266        256        222        154        120 
##         19         20         21         22         23         24 25 or more       <NA> 
##         46         47         21         17          5          7         24      13708

# Top-code 7-day employment income with a break point of 10000, using the same
# top_recode() helper as the other ordinal variables.
mydata <- top_recode(
  variable = "q224_income",
  break_point = 10000,
  missing = NA
)
## [1] "Frequency table before encoding"
## q224_income. 224 How much income in cash or in kind did  earn from employment in the last 7 d
##      0     21     28     35     50     60     63     70     71     84    100    105    108 
##    207      3      2      1      1      1      5      7      1      5     16      1      1 
##    112    120    125    133    135    140    150    160    161    162    165    168    170 
##      2      7      2      1      1     19     17      2      1      1      1      1      1 
##    175    180    190    200    210    214    225    231    233    240    245    250    270 
##      5      4      1     60     21      1      2      5      1      1      3     17      1 
##    280    300    314    335    350    360    390    396    400    420    450    466    480 
##      9     48      1      1     48      2      2      1     69     18     10      2      2 
##    490    495    500    510    525    539    540    550    560    571    581    600    625 
##      9      1     58      1      1      1      2      2     12      1      2    114      1 
##    630    650    654    660    700    714    720    750    770    790    800    812    817 
##      3      3      1      1    313      2      5     38      3      1     47      1      1 
##    819    825    840    850    857    875    900    910    931    933    938    945    950 
##      1      1     20     12      2      2     42      4      3      2      3      1      3 
##    960    996   1000   1005   1010   1025   1050   1100   1125   1148   1150   1162   1166 
##      1      2    116      2      1      5    143      5      2      1      1      5      2 
##   1190   1200   1225   1243   1250   1260   1295   1310   1320   1323   1330   1350   1360 
##      3     78      1      1     20      8      1      2      1      2      2      1      1 
##   1395   1400   1428   1450   1470   1500   1512   1520   1550   1585   1596   1600   1610 
##      1    776      3      4      1     77      1      1      1      1      1     15      3 
##   1631   1633   1650   1675   1680   1700   1750   1800   1806   1862   1866   1890   1900 
##      6      6      4      1      2      8     87     28      1      1      2      3      2 
##   2000   2010   2050   2100   2150   2200   2250   2300   2331   2333   2350   2400   2450 
##     44      1      1    528      1      1      4      2      2      2      1     17     39 
##   2500   2566   2600   2625   2665   2700   2800   3000   3150   3200   3250   3500   3600 
##     24      1      3      1      1      2    112     30      4      5      3    186      7 
##   3800   3850   4000   4200   4300   4400   4500   4550   4800   4900   5000   5250   5362 
##      1      5      2     26      1      1      3      1      1      6      6      1      1 
##   5538   5600   6000   6300   6400   6500   6566   7000   7142   7200   7500   7700   8000 
##      1      6      1      1      1      4      1     12      1      1      3      2      4 
##   8400   8631   9165   9333  10000  10500  12000  12600  14000  15000  15900  16500  21500 
##      2      1      1      1      1      4      3      2      2      4      1      1      1 
##  24500  28000  34000 714200   <NA> 
##      2      2      1      1  12090

## [1] "Frequency table after encoding"
## q224_income. 224 How much income in cash or in kind did  earn from employment in the last 7 d
##             0            21            28            35            50            60 
##           207             3             2             1             1             1 
##            63            70            71            84           100           105 
##             5             7             1             5            16             1 
##           108           112           120           125           133           135 
##             1             2             7             2             1             1 
##           140           150           160           161           162           165 
##            19            17             2             1             1             1 
##           168           170           175           180           190           200 
##             1             1             5             4             1            60 
##           210           214           225           231           233           240 
##            21             1             2             5             1             1 
##           245           250           270           280           300           314 
##             3            17             1             9            48             1 
##           335           350           360           390           396           400 
##             1            48             2             2             1            69 
##           420           450           466           480           490           495 
##            18            10             2             2             9             1 
##           500           510           525           539           540           550 
##            58             1             1             1             2             2 
##           560           571           581           600           625           630 
##            12             1             2           114             1             3 
##           650           654           660           700           714           720 
##             3             1             1           313             2             5 
##           750           770           790           800           812           817 
##            38             3             1            47             1             1 
##           819           825           840           850           857           875 
##             1             1            20            12             2             2 
##           900           910           931           933           938           945 
##            42             4             3             2             3             1 
##           950           960           996          1000          1005          1010 
##             3             1             2           116             2             1 
##          1025          1050          1100          1125          1148          1150 
##             5           143             5             2             1             1 
##          1162          1166          1190          1200          1225          1243 
##             5             2             3            78             1             1 
##          1250          1260          1295          1310          1320          1323 
##            20             8             1             2             1             2 
##          1330          1350          1360          1395          1400          1428 
##             2             1             1             1           776             3 
##          1450          1470          1500          1512          1520          1550 
##             4             1            77             1             1             1 
##          1585          1596          1600          1610          1631          1633 
##             1             1            15             3             6             6 
##          1650          1675          1680          1700          1750          1800 
##             4             1             2             8            87            28 
##          1806          1862          1866          1890          1900          2000 
##             1             1             2             3             2            44 
##          2010          2050          2100          2150          2200          2250 
##             1             1           528             1             1             4 
##          2300          2331          2333          2350          2400          2450 
##             2             2             2             1            17            39 
##          2500          2566          2600          2625          2665          2700 
##            24             1             3             1             1             2 
##          2800          3000          3150          3200          3250          3500 
##           112            30             4             5             3           186 
##          3600          3800          3850          4000          4200          4300 
##             7             1             5             2            26             1 
##          4400          4500          4550          4800          4900          5000 
##             1             3             1             1             6             6 
##          5250          5362          5538          5600          6000          6300 
##             1             1             1             6             1             1 
##          6400          6500          6566          7000          7142          7200 
##             1             4             1            12             1             1 
##          7500          7700          8000          8400          8631          9165 
##             3             2             4             2             1             1 
##          9333 10000 or more          <NA> 
##             1            25         12090

Indirect PII - Categorical: Recode, encode, or top/bottom code extreme values

# !!! List the relevant variables below (Indirect PII - Categorical, plus any
# Ordinal variables that have not been processed yet)

indirect_PII <- c(
  "q206_rel", "q209_miss_schl", "q212_grade", "q213_edu",
  "q215_grade", "q216_schl_type", "q222_emp_status"
)
# capture_tables() is a helper defined outside this section; presumably it
# tabulates each listed variable for review -- confirm in the helper file.
capture_tables(indirect_PII)

# Recode variables whose categories are too specific.

# q206_rel: the cut points 5 and 99 put original codes 5 (Divorced) and
# 6 (Widowed/Widower) into the same interval [5,99), so both end up in the
# single category labelled "Divorced/Widowed/Widower".
break_rel <- c(1:5, 99)
labels_rel <- c(
  "Single/not committed" = 1,
  "Single, committed or engaged" = 2,
  "Currently Married and cohabitating" = 3,
  "Married but not cohabitating" = 4,
  "Divorced/Widowed/Widower" = 5
)
# ordinal_recode() is a helper defined outside this section.
mydata <- ordinal_recode(
  variable = "q206_rel",
  break_points = break_rel,
  missing = 999999,
  value_labels = labels_rel
)

## [1] "Frequency table before encoding"
## q206_rel. 206 Is 's relationship finalized?
##               Single/not committed       Single, committed or engaged 
##                               6435                                566 
## Currently Married and cohabitating       Married but not cohabitating 
##                               5705                               1110 
##                           Divorced                    Widowed/Widower 
##                                 17                                621 
##                               <NA> 
##                               1586 
##    recoded
##     [1,2) [2,3) [3,4) [4,5) [5,99) [99,1e+06)
##   1  6435     0     0     0      0          0
##   2     0   566     0     0      0          0
##   3     0     0  5705     0      0          0
##   4     0     0     0  1110      0          0
##   5     0     0     0     0     17          0
##   6     0     0     0     0    621          0
## [1] "Frequency table after encoding"
## q206_rel. 206 Is 's relationship finalized?
##               Single/not committed       Single, committed or engaged 
##                               6435                                566 
## Currently Married and cohabitating       Married but not cohabitating 
##                               5705                               1110 
##           Divorced/Widowed/Widower                               <NA> 
##                                638                               1586 
## [1] "Inspect value labels and relabel as necessary"
##               Single/not committed       Single, committed or engaged 
##                                  1                                  2 
## Currently Married and cohabitating       Married but not cohabitating 
##                                  3                                  4 
##           Divorced/Widowed/Widower 
##                                  5
# Print the current value labels of q215_grade to choose recode break points
val_labels(mydata[["q215_grade"]])
##                                    Other: Specify 
##                                               -96 
## Never Attended School or Only Attended Pre-School 
##                                                 0 
##                                           Grade 1 
##                                                 1 
##                                           Grade 2 
##                                                 2 
##                                           Grade 3 
##                                                 3 
##                                           Grade 4 
##                                                 4 
##                                           Grade 5 
##                                                 5 
##                                           Grade 6 
##                                                 6 
##                                           Grade 7 
##                                                 7 
##                                           Grade 8 
##                                                 8 
##                                           Grade 9 
##                                                 9 
##                                          Grade 10 
##                                                10 
##                                          Grade 11 
##                                                11 
##                                          Grade 12 
##                                                12 
##                         University / Not Graduate 
##                                                13 
##                             University / Graduate 
##                                                14 
##                 Post Bachelors Tertiary Education 
##                                                15 
##                                 Technical traning 
##                                                16 
##                              Professional studies 
##                                                17 
##                                     Below primary 
##                                                18
# q215_grade: codes 0-13 and 18 each keep their own category, while the
# interval [14,18) collapses original codes 14-17 into one top category.
# NOTE(review): the collapsed category is labelled "University / Graduate or
# higher" but also absorbs codes 16 (technical training) and 17 (professional
# studies) -- confirm this grouping is intended.
# NOTE(review): "Other: Specify" (-96) lies below the first break point; no
# such values appear in the frequency table, but verify how the helper would
# treat them.
break_edu <- c(0:13, 14, 18, 19)

# New value labels, coded 1-16 in the order of the recoded intervals.
labels_edu <- c(
  "Never Attended School or Only Attended Pre-School" = 1,
  setNames(as.numeric(2:13), paste("Grade", 1:12)),
  "University / Not Graduate" = 14,
  "University / Graduate or higher" = 15,
  "Below primary" = 16
)

# ordinal_recode() is a helper defined outside this section.
mydata <- ordinal_recode(
  variable = "q215_grade",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

## [1] "Frequency table before encoding"
## q215_grade. 215 At what grade is  currently studying?
## Never Attended School or Only Attended Pre-School 
##                                                53 
##                                           Grade 1 
##                                               436 
##                                           Grade 2 
##                                               507 
##                                           Grade 3 
##                                               524 
##                                           Grade 4 
##                                               501 
##                                           Grade 5 
##                                              2631 
##                                           Grade 6 
##                                               395 
##                                           Grade 7 
##                                               462 
##                                           Grade 8 
##                                               408 
##                                           Grade 9 
##                                               309 
##                                          Grade 10 
##                                               208 
##                                          Grade 11 
##                                                99 
##                                          Grade 12 
##                                                98 
##                         University / Not Graduate 
##                                                66 
##                             University / Graduate 
##                                                10 
##                 Post Bachelors Tertiary Education 
##                                                 7 
##                                 Technical traning 
##                                                 6 
##                              Professional studies 
##                                                 2 
##                                     Below primary 
##                                                 2 
##                                              <NA> 
##                                              9316 
##     recoded
##      [0,1) [1,2) [2,3) [3,4) [4,5) [5,6) [6,7) [7,8) [8,9) [9,10) [10,11) [11,12) [12,13)
##   0     53     0     0     0     0     0     0     0     0      0       0       0       0
##   1      0   436     0     0     0     0     0     0     0      0       0       0       0
##   2      0     0   507     0     0     0     0     0     0      0       0       0       0
##   3      0     0     0   524     0     0     0     0     0      0       0       0       0
##   4      0     0     0     0   501     0     0     0     0      0       0       0       0
##   5      0     0     0     0     0  2631     0     0     0      0       0       0       0
##   6      0     0     0     0     0     0   395     0     0      0       0       0       0
##   7      0     0     0     0     0     0     0   462     0      0       0       0       0
##   8      0     0     0     0     0     0     0     0   408      0       0       0       0
##   9      0     0     0     0     0     0     0     0     0    309       0       0       0
##   10     0     0     0     0     0     0     0     0     0      0     208       0       0
##   11     0     0     0     0     0     0     0     0     0      0       0      99       0
##   12     0     0     0     0     0     0     0     0     0      0       0       0      98
##   13     0     0     0     0     0     0     0     0     0      0       0       0       0
##   14     0     0     0     0     0     0     0     0     0      0       0       0       0
##   15     0     0     0     0     0     0     0     0     0      0       0       0       0
##   16     0     0     0     0     0     0     0     0     0      0       0       0       0
##   17     0     0     0     0     0     0     0     0     0      0       0       0       0
##   18     0     0     0     0     0     0     0     0     0      0       0       0       0
##     recoded
##      [13,14) [14,18) [18,19) [19,1e+06)
##   0        0       0       0          0
##   1        0       0       0          0
##   2        0       0       0          0
##   3        0       0       0          0
##   4        0       0       0          0
##   5        0       0       0          0
##   6        0       0       0          0
##   7        0       0       0          0
##   8        0       0       0          0
##   9        0       0       0          0
##   10       0       0       0          0
##   11       0       0       0          0
##   12       0       0       0          0
##   13      66       0       0          0
##   14       0      10       0          0
##   15       0       7       0          0
##   16       0       6       0          0
##   17       0       2       0          0
##   18       0       0       2          0
## [1] "Frequency table after encoding"
## q215_grade. 215 At what grade is  currently studying?
## Never Attended School or Only Attended Pre-School 
##                                                53 
##                                           Grade 1 
##                                               436 
##                                           Grade 2 
##                                               507 
##                                           Grade 3 
##                                               524 
##                                           Grade 4 
##                                               501 
##                                           Grade 5 
##                                              2631 
##                                           Grade 6 
##                                               395 
##                                           Grade 7 
##                                               462 
##                                           Grade 8 
##                                               408 
##                                           Grade 9 
##                                               309 
##                                          Grade 10 
##                                               208 
##                                          Grade 11 
##                                                99 
##                                          Grade 12 
##                                                98 
##                         University / Not Graduate 
##                                                66 
##                   University / Graduate or higher 
##                                                25 
##                                     Below primary 
##                                                 2 
##                                              <NA> 
##                                              9316 
## [1] "Inspect value labels and relabel as necessary"
## Never Attended School or Only Attended Pre-School 
##                                                 1 
##                                           Grade 1 
##                                                 2 
##                                           Grade 2 
##                                                 3 
##                                           Grade 3 
##                                                 4 
##                                           Grade 4 
##                                                 5 
##                                           Grade 5 
##                                                 6 
##                                           Grade 6 
##                                                 7 
##                                           Grade 7 
##                                                 8 
##                                           Grade 8 
##                                                 9 
##                                           Grade 9 
##                                                10 
##                                          Grade 10 
##                                                11 
##                                          Grade 11 
##                                                12 
##                                          Grade 12 
##                                                13 
##                         University / Not Graduate 
##                                                14 
##                   University / Graduate or higher 
##                                                15 
##                                     Below primary 
##                                                16

Matching and crosstabulations: Run automated PII check

# Choose the variables for the sdcMicro object based on the dictionary review.
# Reference: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Every name must match a column name in the data file.

# Categorical key variables: gender, occupation/education and age
##!!! Replace with candidate categorical demo vars
selectedKeyVars <- c("q204_gender", "q203_age", "q213_edu")

# Weight variable (add if available)
##!!! Replace with weight var
# selectedWeightVar <- c("projwt")

# Household id variable (cluster)
##!!! Replace with household id
selectedHouseholdID <- "hh_id"

# Build the sdcMicro object from the selected variables, then print its
# summary for inspection.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)

sdcInitial
## The input dataset consists of 16040 rows and 35 variables.
##   --> Categorical key variables: q204_gender, q203_age, q213_edu
##   --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories      Mean size            Size of smallest (>0)     
##   q204_gender                    4  (4)  5340.667 (5340.667)                     1  (1)
##      q203_age                   82 (82)   197.568  (197.568)                     2  (2)
##      q213_edu                   11 (11)  1048.700 (1048.700)                    33 (33)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 0 (0.000%)
##   - 3-anonymity: 0 (0.000%)
##   - 5-anonymity: 9 (0.056%)
## 
## ----------------------------------------------------------------------

Open-ends: review responses for any sensitive information, redact as necessary

# !!! No open-ends

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

Appends "_PU" (Public Use) to the end of the output file name

# Write the processed data as Stata (.dta) and SPSS (.sav) files, both named
# after `filename` with the "_PU" suffix.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Build the report title from the dataset file name
title_var <- paste0("DOL-ILAB SDC - ", filename)