# Start from an empty workspace, including hidden (dot-prefixed) objects.
# `TRUE` and `all.names` are spelled out: the bare name `t` is base R's
# transpose function rather than a logical value, so the previous call did
# not reliably request hidden objects, and `all` relied on partial argument
# matching.
rm(list = ls(all.names = TRUE))

filename <- "Section_1" # !!!Update filename
functions_vers <- "functions_1.8.R" # !!!Update helper functions file

# Load the helper script named above.
# NOTE(review): presumably this defines the recoding/reporting helpers
# called later in this script (encode_location, top_recode, ...) - confirm.
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!No Direct PII
# !!!No Direct PII - team
# Location variable(s) whose codes are replaced by encode_location();
# 999999 is passed as the missing-value code.
locvars <- "ec_s1q9"
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)
## [1] "Frequency table before encoding"
## ec_s1q9. Q210: In which municipality is your school located? Saang munisipyo naroon ang
## 1 10 105 107 110 111 114 116 18 184 197 200 201 213 216 227 229 230 233 234 241 242 244 253 255 264 266
## 1 128 231 18 5 444 1 10 127 1 3 1 1 2 1 3 1 27 1 1 4 2 1 4 19 9 1
## 268 271 272 274 284 294 297 304 309 310 313 317 320 327 334 335 338 339 353 354 356 358 364 367 368 369 37
## 1 1 70 1 2 2 1 2 1 241 14 19 1 48 1 1 6 5 87 2 1 2 1 4 63 1 3
## 370 371 372 373 374 376 378 379 38 380 383 384 385 386 387 390 393 396 397 398 400 405 409 413 414 415 422
## 75 3 82 102 2 1 1 41 1 1 1 3 2 291 75 146 1 1 156 107 1 2 2 140 23 60 2
## 423 424 425 429 430 432 433 438 439 443 449 45 46 461 480 481 488 491 492 497 509 51 514 519 523 527 534
## 31 1 110 92 34 94 2 3 1 30 1 96 4 3 1 76 127 1 88 84 53 5 19 106 32 2 1
## 54 541 543 545 571 599 60 607 61 67 72 91
## 360 2 1 1 31 22 1 34 3 2 2 7
## [1] "Frequency table after encoding"
## ec_s1q9. Q210: In which municipality is your school located? Saang munisipyo naroon ang
## 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819
## 94 31 3 76 84 1 1 1 1 1 5 92 1 1 6 2 75 1 70 4 106 1 2 19 1 1 2
## 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846
## 22 3 146 2 1 1 53 2 96 1 32 2 1 34 2 156 1 1 1 3 23 127 107 5 1 1 82
## 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873
## 1 2 10 1 87 1 3 1 27 14 231 360 60 140 127 31 1 2 7 110 1 48 41 102 2 1 3
## 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900
## 1 1 1 1 88 4 5 4 2 1 1 1 19 1 2 291 2 2 75 19 3 1 2 1 128 34 444
## 901 902 903 904 905 906 907 908 909 910 911 912
## 63 4 2 9 241 1 2 30 3 18 3 1
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.

# ec_s1q13: convert to a factor, then relabel its levels in their existing
# order. Labels that repeat give several original levels the same label.
# NOTE(review): the label vector is positional and is expected to supply one
# entry per existing level - re-check it whenever the data change.
mydata$ec_s1q13 <- as.factor(mydata$ec_s1q13)
mydata$ec_s1q13 <- factor(
  mydata$ec_s1q13,
  levels = levels(mydata$ec_s1q13),
  labels = c(
    "-998",
    "1-6", "1-6", "1-6",
    "7-10", "7-10",
    "11-12", "11-12",
    "14", "17", "23", "24",
    "25 or more",
    "96"
  ),
  ordered = FALSE
)
# ec_s1q15: convert to a factor, then relabel its levels in their existing
# order. Labels that repeat give several original levels the same label.
# NOTE(review): the label vector is positional and is expected to supply one
# entry per existing level - re-check it whenever the data change.
mydata$ec_s1q15 <- as.factor(mydata$ec_s1q15)
mydata$ec_s1q15 <- factor(
  mydata$ec_s1q15,
  levels = levels(mydata$ec_s1q15),
  labels = c(
    "-998",
    "1-6", "1-6", "1-6",
    "7-10", "7-10", "7-10", "7-10",
    "11-12", "11-12",
    "13", "15", "16", "17", "19",
    "20", "21", "22", "23", "24", "25",
    "96"
  ),
  ordered = FALSE
)
# Top code open school days with large unusual numbers: with a break point
# of 6, the values 6 and 7 are reported together as "6 or more".
mydata <- top_recode(
  "ec_s1q21",
  break_point = 6,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## ec_s1q21. Q221: In the past 7 days, how many days was your school open for teaching? Sa n
## 0 1 2 3 4 5 6 7 <NA>
## 1924 12 40 72 159 1909 30 12 355
## [1] "Frequency table after encoding"
## ec_s1q21. Q221: In the past 7 days, how many days was your school open for teaching? Sa n
## 0 1 2 3 4 5 6 or more <NA>
## 1924 12 40 72 159 1909 42 355
# Top code household composition variables with large and unusual numbers.
# Topcode cases with 13 or more siblings.
mydata <- top_recode(
  "ec_s1q23",
  break_point = 13,
  missing = c(888, 999999)
)
## [1] "Frequency table before encoding"
## ec_s1q23. How many siblings do you have that share at least a mother or father (regardless
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16
## 200 237 599 722 665 560 529 327 278 154 120 58 34 19 5 6
## [1] "Frequency table after encoding"
## ec_s1q23. How many siblings do you have that share at least a mother or father (regardless
## 0 1 2 3 4 5 6 7 8 9
## 200 237 599 722 665 560 529 327 278 154
## 10 11 12 13 or more
## 120 58 34 30
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
indirect_PII <- c(
  "ec_s1q2", "ec_s1q3", "ec_s1q5", "ec_s1q12",
  "ec_s1q17", "ec_s1q18", "genderfix", "ec_female"
)

# NOTE(review): capture_tables() is a helper defined outside this script;
# presumably it tabulates each listed variable for review - confirm.
capture_tables(indirect_PII)
# Recode those with very specific values.
# ec_s1q2 is cut at the break points below and each resulting interval gets
# the value label at the same position. Only own child, grandchild,
# nephew/niece and other/unrelated keep their own category; the remaining
# observed relationships are pooled under "Other".
break_activity <- c(-999, -998, 1, 3:13, 15, 16, 22)
labels_activity <- c(
  "Refused to answer" = 1,
  "Dont know" = 2,
  "Other" = 3,
  "Own child" = 4,
  "Other" = 5,
  "Other" = 6,
  "Other" = 7,
  "Other" = 8,
  "Grandchild" = 9,
  "Other" = 10,
  "Nephew/niece" = 11,
  "Other" = 12,
  "Other" = 13,
  "Other" = 14,
  "Other" = 15,
  "Other" = 16,
  "Other/Unrelated " = 17
)
mydata <- ordinal_recode(
  variable = "ec_s1q2",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## ec_s1q2. IS 's:
## Self Own child Step-child Parent Sibling
## 28 4013 29 15 18
## Grandparent Grandchild Cousin Nephew/niece Son/daughter-in-law
## 26 271 4 67 14
## Brother/sister-in-law Parent-in-law Aunt/uncle Other relative Other/Unrelated
## 3 4 12 1 8
## recoded
## [-999,-998) [-998,1) [1,3) [3,4) [4,5) [5,6) [6,7) [7,8) [8,9) [9,10) [10,11) [11,12) [12,13) [13,15)
## 1 0 0 28 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 4013 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 29 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 15 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 18 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 26 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 271 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 4 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 67 0 0 0
## 11 0 0 0 0 0 0 0 0 0 0 0 14 0 0
## 12 0 0 0 0 0 0 0 0 0 0 0 0 3 0
## 13 0 0 0 0 0 0 0 0 0 0 0 0 0 4
## 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 22 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## recoded
## [15,16) [16,22) [22,1e+06)
## 1 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## 7 0 0 0
## 8 0 0 0
## 9 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 0 0
## 13 0 0 0
## 15 12 0 0
## 16 0 1 0
## 22 0 0 8
## [1] "Frequency table after encoding"
## ec_s1q2. IS 's:
## Other Own child Grandchild Nephew/niece Other/Unrelated
## 154 4013 271 67 8
## [1] "Inspect value labels and relabel as necessary"
## Refused to answer Dont know Other Own child Other Other
## 1 2 3 4 5 6
## Other Other Grandchild Other Nephew/niece Other
## 7 8 9 10 11 12
## Other Other Other Other Other/Unrelated
## 13 14 15 16 17
# ec_s1q12 (type of school) is cut at the break points below and each
# interval gets the value label at the same position. The three highest
# codes (technical or vocational school, ALS, other) share the label "Other".
break_activity <- c(-999, -998, 1:6)
labels_activity <- c(
  "Refused to answer" = 1,
  "Dont know" = 2,
  "Private-Catholic" = 3,
  "Private-Non-Catholic" = 4,
  "Public" = 5,
  "Other" = 6,
  "Other" = 7,
  "Other" = 8
)
mydata <- ordinal_recode(
  variable = "ec_s1q12",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
## [1] "Frequency table before encoding"
## ec_s1q12. Q213: What type of school was this? Anong uri ng paaralan ito?
## Refused to answer Don't know Private - Catholic
## 1 8 136
## Private - Non-Catholic Public Technical or vocational school
## 57 4292 8
## ALS Other
## 9 2
## recoded
## [-999,-998) [-998,1) [1,2) [2,3) [3,4) [4,5) [5,6) [6,1e+06)
## -999 1 0 0 0 0 0 0 0
## -998 0 8 0 0 0 0 0 0
## 1 0 0 136 0 0 0 0 0
## 2 0 0 0 57 0 0 0 0
## 3 0 0 0 0 4292 0 0 0
## 4 0 0 0 0 0 8 0 0
## 5 0 0 0 0 0 0 9 0
## 6 0 0 0 0 0 0 0 2
## [1] "Frequency table after encoding"
## ec_s1q12. Q213: What type of school was this? Anong uri ng paaralan ito?
## Refused to answer Dont know Private-Catholic Private-Non-Catholic Public
## 1 8 136 57 4292
## Other
## 19
## [1] "Inspect value labels and relabel as necessary"
## Refused to answer Dont know Private-Catholic Private-Non-Catholic Public
## 1 2 3 4 5
## Other Other Other
## 6 7 8
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file

# Selected categorical key variables: gender, occupation/education and age
selectedKeyVars <- c("ec_s1q3", "ec_s1q5", "ec_s1q4") ##!!! Replace with candidate categorical demo vars

# Create the sdcMicro object with the assigned key variables
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)

# Print the object's summary (key-variable statistics and k-anonymity counts)
sdcInitial
## The input dataset consists of 4513 rows and 28 variables.
## --> Categorical key variables: ec_s1q3, ec_s1q5, ec_s1q4
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## ec_s1q3 2 (2) 2256.500 (2256.500) 2129 (2129)
## ec_s1q5 22 (22) 205.136 (205.136) 1 (1)
## ec_s1q4 9 (9) 563.875 (563.875) 417 (417)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 30 (0.665%)
## - 3-anonymity: 50 (1.108%)
## - 5-anonymity: 92 (2.039%)
##
## ----------------------------------------------------------------------
# Show values of the key variables for records that violate k-anonymity
# Pass the dataset through the labelDataset() helper.
# NOTE(review): presumably this re-applies variable/value labels - confirm.
mydata <- labelDataset(mydata)

# Logical row mask: TRUE where the second column of the individual risk
# table is below 2.
# NOTE(review): column 2 is presumably the sample frequency of the record's
# key combination - confirm against the sdcMicro documentation.
notAnon <- sdcInitial@risk$individual[, 2] < 2 # for 2-anonymity

# Print the key variables of the flagged records
mydata[notAnon, selectedKeyVars]
## # A tibble: 30 x 3
## ec_s1q3 ec_s1q5 ec_s1q4
## <dbl+lbl> <dbl> <dbl>
## 1 0 [Female] 96 17
## 2 0 [Female] 8 12
## 3 0 [Female] 2 13
## 4 1 [Male] 3 15
## 5 1 [Male] 96 15
## 6 1 [Male] 23 17
## 7 0 [Female] 3 16
## 8 1 [Male] 13 16
## 9 0 [Female] 23 10
## 10 0 [Female] 21 17
## # ... with 20 more rows
# Run local suppression on the sdcMicro object with its default settings
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables: print the key variables of the
# previously flagged records as they stand after suppression
extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first element will be used
## ec_s1q3 ec_s1q5 ec_s1q4
## 489 0 NA 17
## 796 0 NA 12
## 919 0 NA 13
## 924 1 NA 15
## 932 1 NA 15
## 1084 1 NA 17
## 1406 0 NA 16
## 1461 1 NA 16
## 1820 0 NA 10
## 2104 0 NA 17
## 2303 1 NA 15
## 2419 1 NA 15
## 2605 1 NA 16
## 2627 1 NA 13
## 2631 0 NA 10
## 2797 0 NA 16
## 2826 1 NA 17
## 2846 1 NA 17
## 2857 0 NA 12
## 3158 1 NA 10
## 3301 0 NA 15
## 3302 1 NA 13
## 3351 1 NA 16
## 3730 1 NA 17
## 3769 1 NA 10
## 4136 0 NA 12
## 4138 0 NA 12
## 4246 1 NA 10
## 4399 0 NA 16
## 4443 1 NA 17
# Apply the local-suppression result to the dataset that gets exported.
# For the records flagged in `notAnon`, the suppression output blanks
# ec_s1q5 and leaves ec_s1q3 and ec_s1q4 untouched, so ec_s1q5 is the
# variable set to missing here. The earlier code blanked ec_s1q3 instead,
# which did not match the suppression pattern.
# !!! Re-check this variable name whenever the key variables or data change.
mydata[notAnon, "ec_s1q5"] <- NA
# !!! Identify open-end variables here:
open_ends <- c("ec_s1q6", "ec_s1q14", "ec_s1q16", "ec_s1q19")

# NOTE(review): report_open() is a helper defined outside this script;
# presumably it writes the responses to "verbatims.csv" for review - confirm.
report_open(list_open_ends = open_ends)
# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# !!!No variables to be deleted or redacted
# !!!No GPS data
# Export the processed dataset as "<filename>_PU.dta" (Stata) and
# "<filename>_PU.sav" (SPSS).
haven::write_dta(
  mydata,
  paste0(filename, "_PU.dta")
)
haven::write_sav(
  mydata,
  paste0(filename, "_PU.sav")
)
# Add report title dynamically: fixed prefix followed by the section filename
title_var <- paste0(
  "DOL-ILAB SDC - ",
  filename
)