# Start from an empty workspace. `all.names = TRUE` is spelled out in full:
# `t` is the transpose function, not the logical TRUE, so objects whose names
# start with a dot were not being cleared.
rm(list = ls(all.names = TRUE))

Set up filenames

# !!! Update filename: base name of the dataset being processed.
filename <- "Section_1"

# !!! Update helper functions file: script that is sourced for the helpers.
functions_vers <- "functions_1.8.R"

Set up data and functions, and create the dictionary for dataset review

# Load the helper functions from the script named in `functions_vers`.
source(functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# !!!No Direct PII 

Direct PII-team: Encode field team names

# !!!No Direct PII - team

Small locations: Encode locations with pop <100,000 using random large numbers

# Small-location variables to encode (here: municipality of the school).
locvars <- "ec_s1q9"
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## ec_s1q9. Q210: In which municipality is your school located?  Saang munisipyo naroon ang 
##   1  10 105 107 110 111 114 116  18 184 197 200 201 213 216 227 229 230 233 234 241 242 244 253 255 264 266 
##   1 128 231  18   5 444   1  10 127   1   3   1   1   2   1   3   1  27   1   1   4   2   1   4  19   9   1 
## 268 271 272 274 284 294 297 304 309 310 313 317 320 327 334 335 338 339 353 354 356 358 364 367 368 369  37 
##   1   1  70   1   2   2   1   2   1 241  14  19   1  48   1   1   6   5  87   2   1   2   1   4  63   1   3 
## 370 371 372 373 374 376 378 379  38 380 383 384 385 386 387 390 393 396 397 398 400 405 409 413 414 415 422 
##  75   3  82 102   2   1   1  41   1   1   1   3   2 291  75 146   1   1 156 107   1   2   2 140  23  60   2 
## 423 424 425 429 430 432 433 438 439 443 449  45  46 461 480 481 488 491 492 497 509  51 514 519 523 527 534 
##  31   1 110  92  34  94   2   3   1  30   1  96   4   3   1  76 127   1  88  84  53   5  19 106  32   2   1 
##  54 541 543 545 571 599  60 607  61  67  72  91 
## 360   2   1   1  31  22   1  34   3   2   2   7 
## [1] "Frequency table after encoding"
## ec_s1q9. Q210: In which municipality is your school located?  Saang munisipyo naroon ang 
## 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 
##  94  31   3  76  84   1   1   1   1   1   5  92   1   1   6   2  75   1  70   4 106   1   2  19   1   1   2 
## 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 
##  22   3 146   2   1   1  53   2  96   1  32   2   1  34   2 156   1   1   1   3  23 127 107   5   1   1  82 
## 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 
##   1   2  10   1  87   1   3   1  27  14 231 360  60 140 127  31   1   2   7 110   1  48  41 102   2   1   3 
## 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 
##   1   1   1   1  88   4   5   4   2   1   1   1  19   1   2 291   2   2  75  19   3   1   2   1 128  34 444 
## 901 902 903 904 905 906 907 908 909 910 911 912 
##  63   4   2   9 241   1   2  30   3  18   3   1

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Global recode of ec_s1q13 and ec_s1q15.
# The labels are positional: one label per existing factor level, in level
# order. Repeating a label merges the corresponding levels into one category,
# so the label vectors must be revisited whenever the set of observed values
# changes (factor() errors if the lengths differ, but cannot detect a shift).

mydata$ec_s1q13 <- as.factor(mydata$ec_s1q13)
mydata$ec_s1q13 <- factor(
  mydata$ec_s1q13,
  levels = levels(mydata$ec_s1q13),
  labels = c(
    "-998",
    "1-6", "1-6", "1-6",
    "7-10", "7-10",
    "11-12", "11-12",
    "14", "17", "23", "24", "25 or more",
    "96"
  ),
  ordered = FALSE
)

mydata$ec_s1q15 <- as.factor(mydata$ec_s1q15)
mydata$ec_s1q15 <- factor(
  mydata$ec_s1q15,
  levels = levels(mydata$ec_s1q15),
  labels = c(
    "-998",
    "1-6", "1-6", "1-6",
    "7-10", "7-10", "7-10", "7-10",
    "11-12", "11-12",
    "13", "15", "16", "17", "19", "20", "21", "22", "23", "24", "25",
    "96"
  ),
  ordered = FALSE
)

# Top code open school days with large unusual numbers

# Pool values of 6 and above into a single top category.
mydata <- top_recode(
  "ec_s1q21",
  break_point = 6,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## ec_s1q21. Q221: In the past 7 days, how many days was your school open for teaching?  Sa n
##    0    1    2    3    4    5    6    7 <NA> 
## 1924   12   40   72  159 1909   30   12  355

## [1] "Frequency table after encoding"
## ec_s1q21. Q221: In the past 7 days, how many days was your school open for teaching?  Sa n
##         0         1         2         3         4         5 6 or more      <NA> 
##      1924        12        40        72       159      1909        42       355

# Top code household composition variables with large and unusual numbers 

# Topcode cases with 13 or more siblings.
mydata <- top_recode(
  "ec_s1q23",
  break_point = 13,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## ec_s1q23. How many siblings do you have that share at least a mother or father (regardless
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  16 
## 200 237 599 722 665 560 529 327 278 154 120  58  34  19   5   6

## [1] "Frequency table after encoding"
## ec_s1q23. How many siblings do you have that share at least a mother or father (regardless
##          0          1          2          3          4          5          6          7          8          9 
##        200        237        599        722        665        560        529        327        278        154 
##         10         11         12 13 or more 
##        120         58         34         30

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables whose frequency tables are captured for review.
indirect_PII <- c(
  "ec_s1q2", "ec_s1q3", "ec_s1q5", "ec_s1q12",
  "ec_s1q17", "ec_s1q18", "genderfix", "ec_female"
)

capture_tables(indirect_PII)

# Recode those with very specific values. 

# ec_s1q2: keep "Own child", "Grandchild", "Nephew/niece" and
# "Other/Unrelated " as separate categories and pool every other code into
# "Other". One label per interval defined by consecutive break points; the
# last interval is open-ended.
break_activity <- c(-999, -998, 1, 3:13, 15, 16, 22)
labels_activity <- setNames(
  as.numeric(seq_len(17)),
  c(
    "Refused to answer", #  1: [-999, -998)
    "Dont know",         #  2: [-998, 1)
    "Other",             #  3: [1, 3)    Self
    "Own child",         #  4: [3, 4)
    "Other",             #  5: [4, 5)    Step-child
    "Other",             #  6: [5, 6)    Parent
    "Other",             #  7: [6, 7)    Sibling
    "Other",             #  8: [7, 8)    Grandparent
    "Grandchild",        #  9: [8, 9)
    "Other",             # 10: [9, 10)   Cousin
    "Nephew/niece",      # 11: [10, 11)
    "Other",             # 12: [11, 12)  Son/daughter-in-law
    "Other",             # 13: [12, 13)  Brother/sister-in-law
    "Other",             # 14: [13, 15)  Parent-in-law
    "Other",             # 15: [15, 16)  Aunt/uncle
    "Other",             # 16: [16, 22)  Other relative
    "Other/Unrelated "   # 17: [22, ...)
  )
)
mydata <- ordinal_recode(
  variable = "ec_s1q2",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)

## [1] "Frequency table before encoding"
## ec_s1q2.  IS 's:
##                  Self             Own child            Step-child                Parent               Sibling 
##                    28                  4013                    29                    15                    18 
##           Grandparent            Grandchild                Cousin          Nephew/niece   Son/daughter-in-law 
##                    26                   271                     4                    67                    14 
## Brother/sister-in-law         Parent-in-law            Aunt/uncle        Other relative       Other/Unrelated 
##                     3                     4                    12                     1                     8 
##     recoded
##      [-999,-998) [-998,1) [1,3) [3,4) [4,5) [5,6) [6,7) [7,8) [8,9) [9,10) [10,11) [11,12) [12,13) [13,15)
##   1            0        0    28     0     0     0     0     0     0      0       0       0       0       0
##   3            0        0     0  4013     0     0     0     0     0      0       0       0       0       0
##   4            0        0     0     0    29     0     0     0     0      0       0       0       0       0
##   5            0        0     0     0     0    15     0     0     0      0       0       0       0       0
##   6            0        0     0     0     0     0    18     0     0      0       0       0       0       0
##   7            0        0     0     0     0     0     0    26     0      0       0       0       0       0
##   8            0        0     0     0     0     0     0     0   271      0       0       0       0       0
##   9            0        0     0     0     0     0     0     0     0      4       0       0       0       0
##   10           0        0     0     0     0     0     0     0     0      0      67       0       0       0
##   11           0        0     0     0     0     0     0     0     0      0       0      14       0       0
##   12           0        0     0     0     0     0     0     0     0      0       0       0       3       0
##   13           0        0     0     0     0     0     0     0     0      0       0       0       0       4
##   15           0        0     0     0     0     0     0     0     0      0       0       0       0       0
##   16           0        0     0     0     0     0     0     0     0      0       0       0       0       0
##   22           0        0     0     0     0     0     0     0     0      0       0       0       0       0
##     recoded
##      [15,16) [16,22) [22,1e+06)
##   1        0       0          0
##   3        0       0          0
##   4        0       0          0
##   5        0       0          0
##   6        0       0          0
##   7        0       0          0
##   8        0       0          0
##   9        0       0          0
##   10       0       0          0
##   11       0       0          0
##   12       0       0          0
##   13       0       0          0
##   15      12       0          0
##   16       0       1          0
##   22       0       0          8
## [1] "Frequency table after encoding"
## ec_s1q2.  IS 's:
##            Other        Own child       Grandchild     Nephew/niece Other/Unrelated  
##              154             4013              271               67                8 
## [1] "Inspect value labels and relabel as necessary"
## Refused to answer         Dont know             Other         Own child             Other             Other 
##                 1                 2                 3                 4                 5                 6 
##             Other             Other        Grandchild             Other      Nephew/niece             Other 
##                 7                 8                 9                10                11                12 
##             Other             Other             Other             Other  Other/Unrelated  
##                13                14                15                16                17
# ec_s1q12 (Q213, type of school): keep the two private types and "Public",
# and pool the remaining codes (4, 5 and 6) into "Other".
break_activity <- c(-999, -998, 1:6)
labels_activity <- setNames(
  as.numeric(seq_len(8)),
  c(
    "Refused to answer",    # 1: [-999, -998)
    "Dont know",            # 2: [-998, 1)
    "Private-Catholic",     # 3: [1, 2)
    "Private-Non-Catholic", # 4: [2, 3)
    "Public",               # 5: [3, 4)
    "Other",                # 6: [4, 5)   Technical or vocational school
    "Other",                # 7: [5, 6)   ALS
    "Other"                 # 8: [6, ...) Other
  )
)

mydata <- ordinal_recode(
  variable = "ec_s1q12",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)

## [1] "Frequency table before encoding"
## ec_s1q12. Q213: What type of school was this?  Anong uri ng paaralan ito?
##              Refused to answer                     Don't know             Private - Catholic 
##                              1                              8                            136 
##         Private - Non-Catholic                         Public Technical or vocational school 
##                             57                           4292                              8 
##                            ALS                          Other 
##                              9                              2 
##       recoded
##        [-999,-998) [-998,1) [1,2) [2,3) [3,4) [4,5) [5,6) [6,1e+06)
##   -999           1        0     0     0     0     0     0         0
##   -998           0        8     0     0     0     0     0         0
##   1              0        0   136     0     0     0     0         0
##   2              0        0     0    57     0     0     0         0
##   3              0        0     0     0  4292     0     0         0
##   4              0        0     0     0     0     8     0         0
##   5              0        0     0     0     0     0     9         0
##   6              0        0     0     0     0     0     0         2
## [1] "Frequency table after encoding"
## ec_s1q12. Q213: What type of school was this?  Anong uri ng paaralan ito?
##    Refused to answer            Dont know     Private-Catholic Private-Non-Catholic               Public 
##                    1                    8                  136                   57                 4292 
##                Other 
##                   19 
## [1] "Inspect value labels and relabel as necessary"
##    Refused to answer            Dont know     Private-Catholic Private-Non-Catholic               Public 
##                    1                    2                    3                    4                    5 
##                Other                Other                Other 
##                    6                    7                    8

Matching and crosstabulations: Run automated PII check

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# !!! Replace with candidate categorical demo vars
selectedKeyVars <- c("ec_s1q3", "ec_s1q5", "ec_s1q4")

# Build the sdcMicro object on the selected key variables and print its
# summary.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial
## The input dataset consists of 4513 rows and 28 variables.
##   --> Categorical key variables: ec_s1q3, ec_s1q5, ec_s1q4
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##  Key Variable Number of categories      Mean size            Size of smallest (>0)       
##       ec_s1q3                    2  (2)  2256.500 (2256.500)                  2129 (2129)
##       ec_s1q5                   22 (22)   205.136  (205.136)                     1    (1)
##       ec_s1q4                    9  (9)   563.875  (563.875)                   417  (417)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 30 (0.665%)
##   - 3-anonymity: 50 (1.108%)
##   - 5-anonymity: 92 (2.039%)
## 
## ----------------------------------------------------------------------

Show values of key variable of records that violate k-anonymity

# NOTE(review): labelDataset() comes from the sourced helper file; presumably
# it attaches value labels -- confirm there.
mydata <- labelDataset(mydata)

# Flag records violating 2-anonymity: column 2 of the individual risk matrix
# (presumably the sample frequency of the key combination) is below 2.
notAnon <- sdcInitial@risk$individual[, 2] < 2

# Show the key-variable values of the flagged records.
mydata[notAnon, selectedKeyVars]
## # A tibble: 30 x 3
##       ec_s1q3 ec_s1q5 ec_s1q4
##     <dbl+lbl>   <dbl>   <dbl>
##  1 0 [Female]      96      17
##  2 0 [Female]       8      12
##  3 0 [Female]       2      13
##  4 1 [Male]         3      15
##  5 1 [Male]        96      15
##  6 1 [Male]        23      17
##  7 0 [Female]       3      16
##  8 1 [Male]        13      16
##  9 0 [Female]      23      10
## 10 0 [Female]      21      17
## # ... with 20 more rows
# Apply local suppression with the package defaults.
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables:
# show the manipulated key variables for the records flagged in `notAnon`.
extractManipData(sdcFinal)[notAnon, selectedKeyVars]
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
##      ec_s1q3 ec_s1q5 ec_s1q4
## 489        0      NA      17
## 796        0      NA      12
## 919        0      NA      13
## 924        1      NA      15
## 932        1      NA      15
## 1084       1      NA      17
## 1406       0      NA      16
## 1461       1      NA      16
## 1820       0      NA      10
## 2104       0      NA      17
## 2303       1      NA      15
## 2419       1      NA      15
## 2605       1      NA      16
## 2627       1      NA      13
## 2631       0      NA      10
## 2797       0      NA      16
## 2826       1      NA      17
## 2846       1      NA      17
## 2857       0      NA      12
## 3158       1      NA      10
## 3301       0      NA      15
## 3302       1      NA      13
## 3351       1      NA      16
## 3730       1      NA      17
## 3769       1      NA      10
## 4136       0      NA      12
## 4138       0      NA      12
## 4246       1      NA      10
## 4399       0      NA      16
## 4443       1      NA      17
# Write the local-suppression result back to the dataset. For every key
# variable, blank exactly the cells that are NA in the manipulated data of
# `sdcFinal`, so the saved file carries the suppression pattern sdcMicro
# chose. In the run shown above that is ec_s1q5 for the flagged records;
# hard-coding "ec_s1q3" here blanked gender instead and left the risky
# ec_s1q5 values in the released data.
suppressed <- extractManipData(sdcFinal)
for (key_var in selectedKeyVars) {
  mydata[is.na(suppressed[[key_var]]), key_var] <- NA
}

Open-ends: review responses for any sensitive information, redact as necessary

# !!! Identify open-end variables here: 

# Open-ended variables to review.
open_ends <- c("ec_s1q6", "ec_s1q14", "ec_s1q16", "ec_s1q19")

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 
# !!!No variables to be deleted or redacted 

GPS data: Displace

# !!!No GPS data

Save processed data in Stata and SPSS format

# Write the processed dataset as "<filename>_PU" in both formats.
haven::write_dta(
  mydata,
  paste0(filename, "_PU.dta")
)
haven::write_sav(
  mydata,
  paste0(filename, "_PU.sav")
)

# Add report title dynamically
title_var <- paste0(
  "DOL-ILAB SDC - ",
  filename
)