#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#' theme: united
#' ---

# Start from a clean workspace. `all.names = TRUE` is spelled out in full so
# that objects whose names begin with a dot are cleared as well; the shorthand
# `t` is base R's transpose function, not the logical constant TRUE, and must
# not be used as a flag value.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

filename <- "Section_8" # !!!Update filename
functions_vers <- "functions_1.8.R" # !!!Update helper functions file

#'# Setup data, functions and create dictionary for dataset review
# Run the helper script named above.
# NOTE(review): `mydata` and the helpers called further down (top_recode,
# ordinal_recode, capture_tables, report_open) are not defined in this file,
# so they are presumably created by this script - confirm.
source(functions_vers)
#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location: Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 


#'# Direct PII: variables to be removed

# Pass the household identifier through zap_labels() and store the result back
# in the same column.
# NOTE(review): presumably haven::zap_labels(), which drops value labels -
# confirm which package the helper script attaches.
mydata[["household_id"]] <- zap_labels(mydata[["household_id"]])

#'# Direct PII-team: Encode field team names
# !!!No Direct PII - team

#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!!No Small locations

#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Top code high income/expense to the 99.5 percentile.
# For each variable, in the order listed, the percentile is taken over the
# non-NA values that differ from 999999 (the code handed to top_recode() as
# `missing`), rounded down with floor(), and used as the break point.
# `percentile_99.5` keeps the value computed for the last variable in the list.
for (top_code_var in c("m_s8q7__1", "m_s8q8__1",
                       "m_s8q7__2", "m_s8q8__2",
                       "m_s8q7__3", "m_s8q8__3",
                       "m_s8q16__1", "m_s8q17__1",
                       "m_s8q27__1", "m_s8q28__1",
                       "m_s8q27__2", "m_s8q27__3")) {
  observed <- na.exclude(mydata[[top_code_var]])
  percentile_99.5 <- floor(quantile(observed[observed != 999999], probs = c(0.995)))
  mydata <- top_recode(variable = top_code_var,
                       break_point = percentile_99.5,
                       missing = 999999)
}

#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables reviewed as categorical indirect PII:
# m_s8q3__1..3, m_s8q12__1..2 and m_s8q21__1..3, in that order.
indirect_PII <- c(
  paste0("m_s8q3__", 1:3),
  paste0("m_s8q12__", 1:2),
  paste0("m_s8q21__", 1:3)
)

# Hand the list to the helper that tabulates each variable.
capture_tables(indirect_PII)

# Recode those with very specific values. 


# m_s8q3__1: 13 break points with value labels coded 1-13.
# NOTE(review): "Other: Specify " carries a trailing space here, unlike the
# same label on m_s8q3__2 and m_s8q3__3 below - confirm whether intended.
break_ocup <- c(-999, -998, -888, 8, 15, 21, 24, 27, 29, 30, 34, 37, 43)
labels_ocup <- setNames(
  as.numeric(1:13),
  c("Refused to answer",
    "Don't know",
    "Other: Specify ",
    "Other",
    "Other",
    "Food processing, wood working, garment and other craft and related trades workers",
    "Other",
    "Other",
    "Other",
    "Other",
    "Street and related sales and service workers",
    "Other",
    "Other")
)
mydata <- ordinal_recode(variable = "m_s8q3__1",
                         break_points = break_ocup,
                         missing = 999999,
                         value_labels = labels_ocup)

# m_s8q3__2: 7 break points with value labels coded 1-7.
break_ocup <- c(-999, -998, -888, 21, 29, 30, 34)
labels_ocup <- setNames(
  as.numeric(1:7),
  c("Refused to answer",
    "Don't know",
    "Other: Specify",
    "Other",
    "Other",
    "Other",
    "Other")
)
mydata <- ordinal_recode(variable = "m_s8q3__2",
                         break_points = break_ocup,
                         missing = 999999,
                         value_labels = labels_ocup)

# m_s8q3__3: 4 break points with value labels coded 1-4.
break_ocup <- c(-999, -998, -888, 1)
labels_ocup <- setNames(
  as.numeric(1:4),
  c("Refused to answer",
    "Don't know",
    "Other: Specify",
    "Other")
)
mydata <- ordinal_recode(variable = "m_s8q3__3",
                         break_points = break_ocup,
                         missing = 999999,
                         value_labels = labels_ocup)


# m_s8q12__1: 15 break points with value labels coded 1-16.
# NOTE(review): there is one more label than break points here, whereas
# m_s8q12__2 below supplies exactly one label per break point - confirm
# against ordinal_recode() whether the 16th label is ever used.
break_ocup <- c(-999, -998, -888, 1, 4, 14, 20, 21, 22, 27, 30, 34, 37, 43, 44)
labels_ocup <- setNames(
  as.numeric(1:16),
  c("Refused to answer",
    "Don't know",
    "Other: Specify",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Street and related sales and service workers",
    "Other",
    "Other",
    "Other",
    "Other")
)
mydata <- ordinal_recode(variable = "m_s8q12__1",
                         break_points = break_ocup,
                         missing = 999999,
                         value_labels = labels_ocup)

# m_s8q12__2: 4 break points with value labels coded 1-4.
break_ocup <- c(-999, -998, -888, 1)
labels_ocup <- setNames(
  as.numeric(1:4),
  c("Refused to answer",
    "Don't know",
    "Other: Specify",
    "Other")
)
mydata <- ordinal_recode(variable = "m_s8q12__2",
                         break_points = break_ocup,
                         missing = 999999,
                         value_labels = labels_ocup)


# m_s8q21__1: 14 break points with value labels coded 1-14.
break_ocup <- c(-999, -998, -888, 1, 7, 16, 20, 21, 24, 27, 29, 30, 34, 37)
labels_ocup <- setNames(
  as.numeric(1:14),
  c("Refused to answer",
    "Don't know",
    "Other: Specify",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Other",
    "Street and related sales and service workers",
    "Other")
)
mydata <- ordinal_recode(variable = "m_s8q21__1",
                         break_points = break_ocup,
                         missing = 999999,
                         value_labels = labels_ocup)

# m_s8q21__2: 5 break points with value labels coded 1-5.
break_ocup <- c(-999, -998, -888, 27, 34)
labels_ocup <- setNames(
  as.numeric(1:5),
  c("Refused to answer",
    "Don't know",
    "Other: Specify",
    "Other",
    "Other")
)
mydata <- ordinal_recode(variable = "m_s8q21__2",
                         break_points = break_ocup,
                         missing = 999999,
                         value_labels = labels_ocup)

# m_s8q21__3: a single break point and a single value label.
break_ocup <- 34
labels_ocup <- setNames(1, "Other")
mydata <- ordinal_recode(variable = "m_s8q21__3",
                         break_points = break_ocup,
                         missing = 999999,
                         value_labels = labels_ocup)


#'# Matching and crosstabulations: Run automated PII check 
# !!!Insufficient demographic data

#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Open-end variables, grouped by variable-name family and instance suffix.
# The order of the vector is the order handed to report_open().
open_ends <- c(
  # m_s8_new_name / m_s8q3_other / m_s8q5_other, instances 1-3
  "m_s8_new_name__1", "m_s8q3_other__1", "m_s8q5_other__1",
  "m_s8_new_name__2", "m_s8q3_other__2", "m_s8q5_other__2",
  "m_s8_new_name__3", "m_s8q3_other__3", "m_s8q5_other__3",
  # m_s8_expand_name / m_s8q12_other / m_s8q14_other, instances 1-2
  "m_s8_expand_name__1", "m_s8q12_other__1", "m_s8q14_other__1",
  "m_s8_expand_name__2", "m_s8q12_other__2", "m_s8q14_other__2",
  # m_s8_close_name / m_s8q21_other / m_s8q25_other / m_endnote8, instances 1-3
  "m_s8_close_name__1", "m_s8q21_other__1", "m_s8q25_other__1", "m_endnote8__1",
  "m_s8_close_name__2", "m_s8q21_other__2", "m_s8q25_other__2", "m_endnote8__2",
  "m_s8_close_name__3", "m_s8q21_other__3", "m_s8q25_other__3", "m_endnote8__3"
)
report_open(list_open_ends = open_ends)


# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 

# m_s8q3_other__1: overwrite the listed row positions with generic text.
# Rows that receive the same replacement are assigned in one step; no row
# position appears in more than one group.
mydata$m_s8q3_other__1[c(4, 15, 38, 51, 55, 79, 146, 166, 195,
                         882, 935, 1020, 1088, 1095, 1143, 1174, 1184,
                         1214, 1220, 1223, 1256, 1393, 1397, 1433, 1469,
                         1507, 1594, 1714, 1792, 1793, 1853, 2000)] <- "Sales workers"
mydata$m_s8q3_other__1[c(114, 1225)] <- "Plant and machine operators, and assemblers"
mydata$m_s8q3_other__1[c(766, 785, 1997, 2239, 2282)] <- "Food processing, wood working, garment and other craft and related trades workers"
mydata$m_s8q3_other__1[c(300, 1829)] <- "[language]"

# m_s8q3_other__2: every listed row receives the same replacement.
mydata$m_s8q3_other__2[c(134, 1095, 1141, 1220, 1221, 1973, 1997)] <- "Sales workers"

# m_s8q12_other__1: overwrite the listed row positions with generic text,
# one assignment per replacement string; no row position is repeated.
mydata$m_s8q12_other__1[c(90, 542, 771, 820, 1006, 1128, 1132, 1134, 1151,
                          1163, 1380, 1430, 1466, 1468, 1469, 1699, 1739,
                          2223)] <- "Sales workers"
mydata$m_s8q12_other__1[c(739, 835, 1690)] <- "Handicraft and printing workers"
mydata$m_s8q12_other__1[445] <- "Legal, social and cultural professionals"
mydata$m_s8q12_other__1[1155] <- "Labourers in mining, construction, manufacturing and transport"
mydata$m_s8q12_other__1[1470] <- "Information and communications technology professionals"
mydata$m_s8q12_other__1[819] <- "[language]"

# m_s8q12_other__2
mydata$m_s8q12_other__2[942] <- "[language]"
mydata$m_s8q12_other__2[1130] <- "Sales workers"

# m_s8q21_other__1: overwrite the listed row positions with generic text,
# one assignment per replacement string; no row position is repeated.
mydata$m_s8q21_other__1[c(129, 149, 244, 347, 1214, 1244, 1275, 1277,
                          1324, 1468)] <- "Sales workers"
mydata$m_s8q21_other__1[89] <- "Food processing, wood working, garment and other craft and related trades workers"
mydata$m_s8q21_other__1[952] <- "Other"

# Single-row replacements in m_s8q25_other__1 and m_endnote8__3.
mydata$m_s8q25_other__1[749] <- "[language]"
mydata$m_endnote8__3[327] <- "[language]"

# m_endnote8__1: rows replaced wholesale by the "[language]" placeholder.
mydata$m_endnote8__1[c(253, 262, 286, 302, 303, 749)] <- "[language]"

# m_endnote8__1: rows rewritten with placeholders embedded in the retained text.
mydata$m_endnote8__1[261] <- "They just close the business to avoid [language]"
mydata$m_endnote8__1[305] <- "[name] selling fish started last February for 2 weeks And she just recieved her DOLE livelihood program last Wednesday. She said that she just wait for timing and restart her fish vending business."
mydata$m_endnote8__1[318] <- "[name] said that she closed her fishball vending because of her youngest son. [languages]"
mydata$m_endnote8__1[1220] <- "They used to sell goods to the canteen where [name] is working but when class closes, same with the canteen, so the business stopped. But they managed to put up an online shop using [name] fb account, using the profit they earned in selling rice. Also, this year they started to cater people with bulk orders of packed meals using the downpayment given by the customers."


# !!!Remove, as these columns contain sensitive information.
# Every column whose name is listed is dropped in a single pass; names that
# are not present in `mydata` are simply ignored by `%in%`.
mydata <- mydata[!names(mydata) %in% c("m_s8_new_name__1",
                                       "m_s8_new_name__2",
                                       "m_s8_new_name__3",
                                       "m_s8_expand_name__1",
                                       "m_s8_expand_name__2",
                                       "m_s8_close_name__1",
                                       "m_s8_close_name__2",
                                       "m_s8_close_name__3")]



#'# GPS data: Displace
# !!!No GPS data

#'# Save processed data in Stata and SPSS format

# Write the processed data next to the script as "<filename>_PU.dta" (Stata)
# and "<filename>_PU.sav" (SPSS).
haven::write_dta(data = mydata, path = sprintf("%s_PU.dta", filename))
haven::write_sav(data = mydata, path = sprintf("%s_PU.sav", filename))

# Build the report title from the file name; it is referenced by the inline
# `r title_var` expression in the YAML block that follows.
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
