# Start from a clean workspace, including hidden (dot-prefixed) objects.
# `all.names` has to be the logical TRUE: a bare `t` resolves to base::t(),
# the transpose function, which ls() rejects as an invalid `all.names` value.
rm(list = ls(all.names = TRUE))
filename <- "Section_3" # !!!Update filename
functions_vers <- "functions_1.8.R" # !!!Update helper functions file
# Load the helper file named above; presumably it defines top_recode(),
# ordinal_recode() and report_open(), which are called later - TODO confirm.
# NOTE(review): `mydata` is used later but is not created in this part of the
# script; confirm it is loaded by the helper file or by a step not shown here.
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!No Direct PII
# !!!No Direct PII - team
# !!!No Small locations
# Top code eh_s3q14 (amount spent on treatment) at the 99.5th percentile.
# The cut-off is the floored quantile of the observed values: NAs are dropped
# first, then entries equal to the -97 code are left out.
# NOTE(review): the recorded frequency table shows a -998 code, which this
# filter keeps in the quantile calculation - confirm that is intended.
spend_observed <- na.exclude(mydata$eh_s3q14)
spend_valid <- spend_observed[spend_observed != -97]
percentile_99.5 <- floor(quantile(spend_valid, probs = 0.995))
mydata <- top_recode(
  variable = "eh_s3q14",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## eh_s3q14. Q174: In the last 4 weeks, how much in total has been spent on treatment because
## -998 0 1 2 3 4 5 6 7 10 12 13 14 15 16 18 19 20 21
## 17 24 2 1 2 1 10 7 7 34 10 3 4 15 5 10 1 88 8
## 22 24 25 27 28 30 32 33 34 35 36 38 39 40 42 44 45 46 47
## 4 14 17 3 4 74 9 1 1 12 8 4 1 33 5 1 6 3 1
## 48 50 51 52 54 55 56 57 58 60 62 63 64 65 66 67 70 72 74
## 8 129 2 4 2 4 5 1 3 32 1 3 1 4 4 1 11 4 2
## 75 76 78 79 80 81 82 83 84 85 88 89 90 93 94 95 96 98 100
## 10 1 2 3 13 1 1 2 3 2 1 1 15 1 1 1 2 2 162
## 102 103 105 106 108 109 110 113 115 116 118 120 124 125 126 128 130 135 136
## 1 2 3 3 4 1 7 1 4 1 1 21 1 2 2 2 6 2 3
## 139 140 141 144 145 147 148 150 153 156 158 159 160 165 166 170 175 180 182
## 1 6 1 1 2 2 1 51 1 1 1 1 6 1 2 2 1 11 1
## 185 186 188 190 195 200 203 206 208 210 211 215 220 222 224 225 230 231 235
## 1 1 1 2 1 117 1 1 1 4 1 1 2 1 1 1 1 1 1
## 240 241 245 250 256 260 265 270 275 280 283 286 287 290 300 310 314 318 320
## 5 1 1 19 2 3 2 4 2 4 1 1 1 3 88 3 1 2 4
## 330 335 340 345 350 360 361 363 365 368 371 375 378 380 384 390 400 410 420
## 1 1 1 1 12 2 2 1 1 1 1 1 1 1 1 1 24 1 2
## 448 450 456 460 462 466 467 480 498 500 501 510 517 520 529 534 550 560 570
## 1 9 1 1 1 1 1 3 1 108 1 1 1 1 1 1 3 4 1
## 600 608 610 620 630 635 640 650 690 700 710 720 733 740 750 760 770 772 800
## 29 1 1 1 2 1 2 4 1 22 1 1 1 1 7 1 1 1 17
## 850 862 896 900 930 950 960 1000 1005 1010 1035 1041 1050 1064 1075 1100 1150 1170 1200
## 2 1 1 6 1 4 1 78 1 2 1 1 1 1 1 1 1 1 16
## 1250 1270 1275 1280 1300 1316 1340 1342 1350 1373 1400 1420 1500 1540 1550 1600 1620 1700 1712
## 1 1 1 1 6 1 1 1 2 1 2 1 44 1 1 8 1 3 1
## 1720 1740 1741 1750 1760 1800 1805 1842 1890 1894 1900 2000 2040 2060 2100 2150 2200 2300 2400
## 1 1 1 1 1 7 1 1 1 1 2 43 1 1 3 1 3 2 2
## 2450 2456 2500 2600 2650 2696 2700 2740 2750 2800 2868 2880 2900 3000 3060 3100 3200 3450 3500
## 1 1 16 2 2 1 2 1 1 4 1 1 1 32 1 1 1 1 2
## 3600 3800 4000 4150 4220 4500 4600 5000 5140 5400 5430 5500 6000 6500 7000 7300 7980 8000 8800
## 1 1 16 1 1 3 1 27 1 1 1 1 8 1 6 2 1 2 1
## 9000 10000 11600 11730 12000 12748 13600 13650 15000 17000 19145 20000 22000 30000 35000 36000 40000 45000 50000
## 2 8 1 1 4 1 1 1 5 1 1 2 1 1 2 1 1 1 1
## 2e+05 <NA>
## 1 12489
## [1] "Frequency table after encoding"
## eh_s3q14. Q174: In the last 4 weeks, how much in total has been spent on treatment because
## -998 0 1 2 3 4 5 6
## 17 24 2 1 2 1 10 7
## 7 10 12 13 14 15 16 18
## 7 34 10 3 4 15 5 10
## 19 20 21 22 24 25 27 28
## 1 88 8 4 14 17 3 4
## 30 32 33 34 35 36 38 39
## 74 9 1 1 12 8 4 1
## 40 42 44 45 46 47 48 50
## 33 5 1 6 3 1 8 129
## 51 52 54 55 56 57 58 60
## 2 4 2 4 5 1 3 32
## 62 63 64 65 66 67 70 72
## 1 3 1 4 4 1 11 4
## 74 75 76 78 79 80 81 82
## 2 10 1 2 3 13 1 1
## 83 84 85 88 89 90 93 94
## 2 3 2 1 1 15 1 1
## 95 96 98 100 102 103 105 106
## 1 2 2 162 1 2 3 3
## 108 109 110 113 115 116 118 120
## 4 1 7 1 4 1 1 21
## 124 125 126 128 130 135 136 139
## 1 2 2 2 6 2 3 1
## 140 141 144 145 147 148 150 153
## 6 1 1 2 2 1 51 1
## 156 158 159 160 165 166 170 175
## 1 1 1 6 1 2 2 1
## 180 182 185 186 188 190 195 200
## 11 1 1 1 1 2 1 117
## 203 206 208 210 211 215 220 222
## 1 1 1 4 1 1 2 1
## 224 225 230 231 235 240 241 245
## 1 1 1 1 1 5 1 1
## 250 256 260 265 270 275 280 283
## 19 2 3 2 4 2 4 1
## 286 287 290 300 310 314 318 320
## 1 1 3 88 3 1 2 4
## 330 335 340 345 350 360 361 363
## 1 1 1 1 12 2 2 1
## 365 368 371 375 378 380 384 390
## 1 1 1 1 1 1 1 1
## 400 410 420 448 450 456 460 462
## 24 1 2 1 9 1 1 1
## 466 467 480 498 500 501 510 517
## 1 1 3 1 108 1 1 1
## 520 529 534 550 560 570 600 608
## 1 1 1 3 4 1 29 1
## 610 620 630 635 640 650 690 700
## 1 1 2 1 2 4 1 22
## 710 720 733 740 750 760 770 772
## 1 1 1 1 7 1 1 1
## 800 850 862 896 900 930 950 960
## 17 2 1 1 6 1 4 1
## 1000 1005 1010 1035 1041 1050 1064 1075
## 78 1 2 1 1 1 1 1
## 1100 1150 1170 1200 1250 1270 1275 1280
## 1 1 1 16 1 1 1 1
## 1300 1316 1340 1342 1350 1373 1400 1420
## 6 1 1 1 2 1 2 1
## 1500 1540 1550 1600 1620 1700 1712 1720
## 44 1 1 8 1 3 1 1
## 1740 1741 1750 1760 1800 1805 1842 1890
## 1 1 1 1 7 1 1 1
## 1894 1900 2000 2040 2060 2100 2150 2200
## 1 2 43 1 1 3 1 3
## 2300 2400 2450 2456 2500 2600 2650 2696
## 2 2 1 1 16 2 2 1
## 2700 2740 2750 2800 2868 2880 2900 3000
## 2 1 1 4 1 1 1 32
## 3060 3100 3200 3450 3500 3600 3800 4000
## 1 1 1 1 2 1 1 16
## 4150 4220 4500 4600 5000 5140 5400 5430
## 1 1 3 1 27 1 1 1
## 5500 6000 6500 7000 7300 7980 8000 8800
## 1 8 1 6 2 1 2 1
## 9000 10000 11600 11730 12000 12748 13600 13650
## 2 8 1 1 4 1 1 1
## 15000 17000 19145 19610 or more <NA>
## 5 1 1 11 12489
# Regroup eh_s3q5 (marital status) so that codes 4 and 5 share one category.
# Each break point opens a left-closed interval, so [4,6) holds both codes;
# every other interval holds a single original code.
break_edu <- c(-998, 1, 2, 4, 6, 7, 8, 99)
# One label per interval, numbered 1..8 in break-point order.
labels_edu <- setNames(
  as.numeric(seq_along(break_edu)),
  c("-998", "1", "2", "4 or 5", "6", "7", "8", "99")
)
mydata <- ordinal_recode(
  variable = "eh_s3q5",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## eh_s3q5. Q117: What is 's marital status? sBq40: Ano ang civil status ni \
## -998 1 2 4 5 6 7 8 99
## 1 3677 163 8 84 314 956 9376 2
## recoded
## [-998,1) [1,2) [2,4) [4,6) [6,7) [7,8) [8,99) [99,1e+06)
## -998 1 0 0 0 0 0 0 0
## 1 0 3677 0 0 0 0 0 0
## 2 0 0 163 0 0 0 0 0
## 4 0 0 0 8 0 0 0 0
## 5 0 0 0 84 0 0 0 0
## 6 0 0 0 0 314 0 0 0
## 7 0 0 0 0 0 956 0 0
## 8 0 0 0 0 0 0 9376 0
## 99 0 0 0 0 0 0 0 2
## [1] "Frequency table after encoding"
## eh_s3q5. Q117: What is 's marital status? sBq40: Ano ang civil status ni \
## -998 1 2 4 or 5 6 7 8 99
## 1 3677 163 92 314 956 9376 2
## [1] "Inspect value labels and relabel as necessary"
## -998 1 2 4 or 5 6 7 8 99
## 1 2 3 4 5 6 7 8
# !!!Insufficient demographic data
# !!! Identify open-end variables here:
open_ends <- c("eh_s3q6", "eh_s3q10")
report_open(list_open_ends = open_ends)
# Review "verbatims.csv", then list below the row numbers of the responses
# that must be redacted for each open-end variable.
# Rows of eh_s3q6 flagged during review are overwritten with "other".
mydata$eh_s3q6[c(8975, 13940)] <- "other"
# !!!No GPS data
# Write the public-use ("_PU") copy of the data in Stata and SPSS formats.
# Both files share the same stem, built from `filename`.
public_use_stem <- paste0(filename, "_PU")
haven::write_dta(mydata, paste0(public_use_stem, ".dta"))
haven::write_sav(mydata, paste0(public_use_stem, ".sav"))
# Report title, built from the current file name.
title_var <- paste0("DOL-ILAB SDC - ", filename)