# Start from an empty workspace, including hidden (dot-prefixed) objects.
# `all.names = TRUE` is spelled out in full: the earlier `all=t` passed
# base::t (the matrix transpose function) instead of a logical value.
rm(list = ls(all.names = TRUE))

# Base name of the dataset; reused below for the output files and report title.
filename <- "DOLE" # !!!Update filename

# Helper functions file; presumably defines the recoding helpers called later
# in this script (encode_location, top_recode, ordinal_recode, ...) -- confirm.
functions_vers <- "functions_1.8.R" # !!!Update helper functions file
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!No Direct PII
# !!!No Direct PII - team
# !!!Include relevant variables, but check their population size first to confirm they are <100,000
# Location variables whose values are replaced by numeric codes (see the
# before/after frequency tables recorded below). 999999 is passed as the
# missing-value code.
locvars <- "Municipality"
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## Municipality. Municipality
## Abucay Agno Amulung Anda
## 70 56 56 28
## Bugallon Calabanga Calasiao Canaman
## 28 28 14 14
## Candelaria Cauayan City Enrile General Emilio Aguinaldo
## 14 28 28 14
## Jala-Jala Jones Jose Panganiban Labo
## 28 98 42 14
## Magarao Malinao Manito Mariveles
## 28 98 14 98
## Naga City Pagsanjan Pasacao Pila
## 14 70 28 14
## Pilar Pililla Polangui Sampaloc
## 56 28 28 14
## San Carlos City San Mateo San Nicolas Sorsogon City
## 14 14 14 14
## Tanay Tinambac
## 28 14
## [1] "Frequency table after encoding"
## Municipality. Municipality
## 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 56 28 98 28 14 14 28 14 14 42 28 98 28 56 28 70 14 70 14 14 98 28 28 28 14 56 14
## 487 488 489 490 491 492 493
## 14 14 28 14 14 14 28
# Top code Assetvalue at its 99.5th percentile, rounded down.
# NAs and the missing-value code -97 are left out of the percentile
# calculation; -97 is also passed to top_recode() as the missing code.
mydata$Assetvalue <- as.numeric(mydata$Assetvalue)
## Warning: NAs introduced by coercion
percentile_99.5 <- local({
  asset_values <- mydata$Assetvalue
  is_valid <- !is.na(asset_values) & asset_values != -97
  floor(quantile(asset_values[is_valid], probs = 0.995))
})
mydata <- top_recode(
  variable = "Assetvalue",
  break_point = percentile_99.5,
  missing = -97
)
## [1] "Frequency table before encoding"
## Assetvalue.
## 1181 9167.5 9295 9378 9494 9498.5 9514.3 9535 9624 9700 9728 9773
## 1 1 1 1 3 1 1 1 2 1 1 1
## 9786.5 9795 9810 9811.75 9815 9820 9826.5 9838.25 9846 9857.5 9859 9867
## 10 1 1 13 1 3 2 1 1 1 2 1
## 9875 9883.3 9885 9887 9890 9895.5 9896 9900 9902 9910 9917 9919.5
## 2 2 1 1 4 3 2 2 1 2 1 1
## 9920 9920.8 9921.25 9925 9926.32 9930 9934 9936 9937 9938.5 9943 9944
## 1 1 2 1 1 3 3 28 1 2 1 1
## 9945 9945.2 9945.4 9949 9950 9960 9964 9967.5 9972.5 9974 9974.5 9977.65
## 1 2 1 1 97 2 1 3 1 1 3 2
## 9978 9986 9990 9991.25 9993.7 9994 9995 9999.65 9999.75 10000 10000.65 10001
## 1 2 2 1 1 2 2 8 2 625 1 1
## 10003 10004 10005 10007.5 10009 10010 10011 10012 10012.25 10019.5 10020 10021
## 1 3 1 1 3 1 1 26 1 2 1 1
## 10022 10023.9 10024 10029 10029.86 10032 10034 10035 10045 10050 10057 10057.5
## 2 1 4 1 1 1 1 1 1 2 2 1
## 10059 10060 10067.28 10071 10072.5 10075 10082 10085 10090 10091.1 10094.75 10095.3
## 1 1 1 1 1 1 1 1 4 1 11 1
## 10100 10101 10121 10128 10130.5 10132 10137.75 10150 10157 10171 10199 10220
## 1 34 2 1 5 1 1 1 1 1 1 4
## 10223 10254 10257 10280 10318.5 10325.45 10336 10353 10395 10400 10408.04 10412
## 1 1 1 1 1 1 1 1 1 9 1 1
## 10420 10429.5 10450 10450.5 10475 10560 10577.64 10607.9 10631 10691 10702 10737
## 4 1 1 1 1 1 1 1 1 1 1 1
## 10745 10746 10813 10932 10934.5 10938.9 10961 11575 11700 11787.2 11935 12556.4
## 1 1 1 1 1 1 1 2 3 1 2 1
## 13997 14260 19500 <NA>
## 1 2 1 73
## [1] "Frequency table after encoding"
## Assetvalue. 11935
## 1181 9167.5 9295 9378 9494 9498.5 9514.3
## 1 1 1 1 3 1 1
## 9535 9624 9700 9728 9773 9786.5 9795
## 1 2 1 1 1 10 1
## 9810 9811.75 9815 9820 9826.5 9838.25 9846
## 1 13 1 3 2 1 1
## 9857.5 9859 9867 9875 9883.3 9885 9887
## 1 2 1 2 2 1 1
## 9890 9895.5 9896 9900 9902 9910 9917
## 4 3 2 2 1 2 1
## 9919.5 9920 9920.8 9921.25 9925 9926.32 9930
## 1 1 1 2 1 1 3
## 9934 9936 9937 9938.5 9943 9944 9945
## 3 28 1 2 1 1 1
## 9945.2 9945.4 9949 9950 9960 9964 9967.5
## 2 1 1 97 2 1 3
## 9972.5 9974 9974.5 9977.65 9978 9986 9990
## 1 1 3 2 1 2 2
## 9991.25 9993.7 9994 9995 9999.65 9999.75 10000
## 1 1 2 2 8 2 625
## 10000.65 10001 10003 10004 10005 10007.5 10009
## 1 1 1 3 1 1 3
## 10010 10011 10012 10012.25 10019.5 10020 10021
## 1 1 26 1 2 1 1
## 10022 10023.9 10024 10029 10029.86 10032 10034
## 2 1 4 1 1 1 1
## 10035 10045 10050 10057 10057.5 10059 10060
## 1 1 2 2 1 1 1
## 10067.28 10071 10072.5 10075 10082 10085 10090
## 1 1 1 1 1 1 4
## 10091.1 10094.75 10095.3 10100 10101 10121 10128
## 1 11 1 1 34 2 1
## 10130.5 10132 10137.75 10150 10157 10171 10199
## 5 1 1 1 1 1 1
## 10220 10223 10254 10257 10280 10318.5 10325.45
## 4 1 1 1 1 1 1
## 10336 10353 10395 10400 10408.04 10412 10420
## 1 1 1 9 1 1 4
## 10429.5 10450 10450.5 10475 10560 10577.64 10607.9
## 1 1 1 1 1 1 1
## 10631 10691 10702 10737 10745 10746 10813
## 1 1 1 1 1 1 1
## 10932 10934.5 10938.9 10961 11575 11700 11787.2
## 1 1 1 1 2 3 1
## 11935 or more <NA>
## 7 73
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
# NOTE(review): "Training3Decription" lacks the "s" its siblings have --
# confirm it matches the actual column name in the data before changing it.
indirect_PII <- c(
  "Assettype",
  "Compliance",
  "Training1Description",
  "Training2Description",
  "Training3Decription",
  "assettype_clean",
  "assetcat",
  "assets_exp"
)

# Tabulate each flagged variable for review.
capture_tables(indirect_PII)
# Recode those with very specific values.
# Collapse the less frequent asset categories of `assetcat` into "Others",
# keeping only the large categories under their own label.

# Lower bounds of the recode intervals. Codes 9 and 19 are not listed, so the
# intervals [8,10) and [18,20) each span two integer codes.
break_activity <- c(
  1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18,
  20, 21, 22, 23, 24, 25
)

# Value labels for the recoded variable. Per the recorded output below, the
# label values refer to the position of the interval (1..23), not to the
# original code: label value 17 is the interval [18,20), i.e. original
# code 18 ("Tools and equipment").
# NOTE(review): 20 distinct values share the label "Others"; confirm that the
# stored codes do not still distinguish the original categories on export.
labels_activity <- c(
  "Sari Sari business/equipment" = 1,
  "Prepared-Food business/equipment (rice, vegetables, fish, etc)" = 2,
  "Fishing business/equipment" = 3,
  "Others" = 4,
  "Others" = 5,
  "Others" = 6,
  "Others" = 7,
  "Others" = 8,
  "Others" = 9,
  "Others" = 10,
  "Others" = 11,
  "Others" = 12,
  "Others" = 13,
  "Others" = 14,
  "Others" = 15,
  "Others" = 16,
  "Tools and equipment" = 17,
  "Others" = 18,
  "Others" = 19,
  "Others" = 20,
  "Others" = 21,
  "Others" = 22,
  "Others" = 23
)

# Store the recode in `mydata`, the object that is exported at the end of the
# script. It was previously assigned only to `mydata2`, so the exported files
# still carried the detailed categories.
# Per the recorded output, the 66 records with code 0 end up as <NA>.
mydata <- ordinal_recode(
  variable = "assetcat",
  break_points = break_activity,
  missing = 999999,
  value_labels = labels_activity
)
# Keep the former result name available for any later code that refers to it.
mydata2 <- mydata
## [1] "Frequency table before encoding"
## assetcat. Asset category (KASAMA)
## 0
## 66
## Sari Sari business/equipment
## 355
## Prepared-Food business/equipment (rice, vegetables, fish, etc)
## 380
## Fishing business/equipment
## 66
## Merienda/ streetfood business/equipment
## 17
## Welding business/equipment
## 5
## Carpentry business/equipment
## 11
## Product manufacturing business/equipment
## 8
## Beauty care business/equipment
## 5
## Livestock raising business/equipment
## 17
## Poultry raising business/equipment
## 2
## Dry good business/equipment
## 4
## Tailoring business/equipment
## 16
## Automotive business/equipment
## 3
## Farming or farming-support business/equipment
## 25
## Vehicle driving business/equipment
## 10
## Masonry or construction business/equipment
## 3
## Tools and equipment
## 138
## Laundry
## 2
## Retail wear
## 9
## Ambulant vendor
## 1
## Paint shop
## 1
## Prepaid phone loading business
## 1
## Kit
## 3
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,6) [6,7) [7,8) [8,10) [10,11) [11,12) [12,13) [13,14) [14,15) [15,16) [16,17)
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 355 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 380 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 66 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 17 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 11 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0
## 12 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0
## 13 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0
## 15 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0
## 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
## 17 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 21 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 22 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 24 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## recoded
## [17,18) [18,20) [20,21) [21,22) [22,23) [23,24) [24,25) [25,1e+06)
## 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0 0
## 17 3 0 0 0 0 0 0 0
## 18 0 138 0 0 0 0 0 0
## 20 0 0 2 0 0 0 0 0
## 21 0 0 0 9 0 0 0 0
## 22 0 0 0 0 1 0 0 0
## 23 0 0 0 0 0 1 0 0
## 24 0 0 0 0 0 0 1 0
## 25 0 0 0 0 0 0 0 3
## [1] "Frequency table after encoding"
## assetcat. Asset category (KASAMA)
## Sari Sari business/equipment
## 355
## Prepared-Food business/equipment (rice, vegetables, fish, etc)
## 380
## Fishing business/equipment
## 66
## Others
## 143
## Tools and equipment
## 138
## <NA>
## 66
## [1] "Inspect value labels and relabel as necessary"
## Sari Sari business/equipment
## 1
## Prepared-Food business/equipment (rice, vegetables, fish, etc)
## 2
## Fishing business/equipment
## 3
## Others
## 4
## Others
## 5
## Others
## 6
## Others
## 7
## Others
## 8
## Others
## 9
## Others
## 10
## Others
## 11
## Others
## 12
## Others
## 13
## Others
## 14
## Others
## 15
## Others
## 16
## Tools and equipment
## 17
## Others
## 18
## Others
## 19
## Others
## 20
## Others
## 21
## Others
## 22
## Others
## 23
# !!!Removed, as it contains sensitive information and there is another variable that contains this information at a more aggregated level.
dropped_columns <- c("Assettype", "assettype_clean")
mydata <- mydata[!(names(mydata) %in% dropped_columns)]
# !!!Insufficient demographic data
# !!!No Open-ends
# !!!No GPS data
# Export the processed data as <filename>_PU.dta and <filename>_PU.sav.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)