#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from a clean workspace. `all.names = TRUE` makes ls() also list objects
# whose names begin with a dot, so they are removed as well. The earlier
# `all=t` passed base R's transpose function `t` rather than the logical TRUE
# and relied on partial matching of the argument name.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

# !!!Update filename: base name of this section; it is reused below to build
# the output file names and the report title.
filename <- "Section_5"
# !!!Update helper functions file
functions_vers <- "functions_1.8.R"

#'# Setup data, functions and create dictionary for dataset review
# `mydata` and the recode/report helpers used below are not defined in this
# script; presumably the sourced file creates them -- confirm against it.
source(functions_vers)

#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

#'# Direct PII: variables to be removed
# !!!No Direct PII

#'# Direct PII-team: Encode field team names
# !!!No Direct PII-team

#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!!No small locations

#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Apply top_recode() to each variable with its manually chosen break point.
# The vector maps variable name -> break point and is processed in order;
# 888 and 999999 are passed as the `missing` codes for every variable.
# NOTE(review): s5q31 is not in this list although s5q31whynoresponse is
# reviewed among the open-ends further down -- confirm the omission is intended.
fixed_break_points <- c(
  s5q1 = 3,
  s5q3 = 1,
  s5q5 = 6,
  s5q7 = 4,
  s5q9 = 9,
  s5q11 = 4,
  s5q13 = 4,
  s5q15 = 3,
  s5q17 = 2,
  s5q19 = 2,
  s5q21 = 2,
  s5q23 = 1,
  s5q25 = 3,
  s5q27 = 4,
  s5q29 = 4,
  s5q33 = 1,
  s5q35 = 3,
  s5q37 = 3,
  s5q39 = 2,
  s5q41 = 6,
  s5q43 = 1,
  s5q45 = 2,
  s5q47 = 2,
  s5q49 = 2,
  s5q51 = 2,
  s5q53 = 2,
  s5q55 = 1,
  s5q56a = 4,
  s5q56fishnet = 2,
  s5q56pedicab = 2,
  s5q56ricestock = 4
)

for (recode_var in names(fixed_break_points)) {
  # `[[` drops the element name so a bare number is handed to top_recode().
  mydata <- top_recode(
    recode_var,
    break_point = fixed_break_points[[recode_var]],
    missing = c(888, 999999)
  )
}

# Drop the loop temporaries so the workspace holds the same objects as before.
rm(fixed_break_points, recode_var)


# Top code high values to the 99.5 percentile

# Variables handled here, in processing order: the even-numbered items
# s5q2 ... s5q56, followed by the four remaining s5q56 sub-items.
percentile_vars <- c(
  paste0("s5q", seq(2, 56, by = 2)),
  "s5q56b",
  "s5q56fishnet2",
  "s5q56pedicab2",
  "s5q56ricestock2"
)

for (topcode_var in percentile_vars) {
  # The break point is the floored 99.5th percentile of the values that are
  # neither NA nor the 999999 code.
  observed <- na.exclude(mydata[[topcode_var]])
  percentile_99.5 <- floor(
    quantile(observed[observed != 999999], probs = c(0.995))
  )
  mydata <- top_recode(
    variable = topcode_var,
    break_point = percentile_99.5,
    missing = 999999
  )
}

# Drop the loop temporaries; `percentile_99.5` is left holding the value
# computed for the last variable.
rm(percentile_vars, topcode_var, observed)

# Remove the s5q58_month column from the dataset.
mydata <- mydata[!(names(mydata) %in% "s5q58_month")]



#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables handed to capture_tables() for review (helper from the sourced
# functions file; presumably it tabulates each one -- confirm).
indirect_PII <- c("s5q57", "s5q63", "s5q65_1", "s5q65_2")

capture_tables(indirect_PII)



# Recode those with very specific values.
# Print the current value labels of s5q65_1 before recoding it.
val_labels(mydata$s5q65_1)

# Break points and the value labels passed alongside them to ordinal_recode();
# 999999 is passed as the `missing` code.
break_transfers <- c(1, 2, 3, 4)
labels_tranfers <- c(
  "GSIS" = 1,
  "Other" = 2,
  "Scholarships" = 3,
  "other: Specify" = 4
)
mydata <- ordinal_recode(
  variable = "s5q65_1",
  break_points = break_transfers,
  missing = 999999,
  value_labels = labels_tranfers
)


#'# Matching and crosstabulations: Run automated PII check 
# !!! Insufficient demographic data


#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here: 
# Open-end variables to review, in order: the "whynoresponse" variables of the
# odd-numbered items s5q1 ... s5q55, of the four s5q56 sub-items, and of s5q57
# and s5q59 ... s5q65, followed by s5q65other.
open_ends <- c(
  paste0("s5q", seq(1, 55, by = 2), "whynoresponse"),
  paste0("s5q56", c("a", "fishnet", "pedicab", "ricestock"), "whynoresponse"),
  paste0("s5q", c(57, 59:65), "whynoresponse"),
  "s5q65other"
)

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number


# Manual redactions: for each open-end column, the element names are the row
# numbers to overwrite and the values are the replacement texts.
redacted_text <- list(
  s5q1whynoresponse = c(
    "50" = "Materials was provided by Gold sun",
    "97" = "[Tagalog]",
    "745" = "Father owned the house and [situation]",
    "961" = "She cannot estimate the price because of the quality of their house",
    "1026" = "She does not know and does not want to answer. The house is owned by the [person].",
    "1100" = "Cannot estimate because the house is made only in a [type of materials]",
    "1156" = "Their house is made by [object].",
    "1161" = "Cannot assess value inherited house and lot, but  the house and lot is big I think is almost [amount] sq.m.",
    "1448" = "Materials came from [site].",
    "1449" = "[Type of materials]"
  ),
  s5q7whynoresponse = c(
    "1512" = "They dont want to give any amount even if ill tell is is it [amount redacted]???"
  ),
  s5q9whynoresponse = c(
    "139" = "[name] only repaired it"
  ),
  s5q13whynoresponse = c(
    "157" = "[Tagalog]",
    "139" = "Came from a [person]"
  ),
  s5q29whynoresponse = c(
    "1451" = "Came from [foundation]"
  ),
  s5q51whynoresponse = c(
    "1294" = "[Person redacted]"
  ),
  s5q56awhynoresponse = c(
    "615" = "Raffle price ([amount redacted])"
  ),
  s5q56fishnetwhynoresponse = c(
    "1078" = "2 or more"
  ),
  s5q56ricestockwhynoresponse = c(
    "55" = "[amount redacted] rice stocks. Total value of [amount redacted]",
    "451" = "[amount redacted]",
    "491" = "[amount redacted] kilos of milled rice"
  ),
  s5q57whynoresponse = c(
    "740" = "The mother of the 2 grandchildren is the member of the 4Ps. But the 2 children is [situation]",
    "1237" = "The member of the 4ps is [name] son of [name] she is the one who attended all the meetings of 4ps every now and then. [name] is not member of the roster because he is only once or every another month going back home."
  ),
  s5q60whynoresponse = c(
    "712" = "He is not sure if it is more than [amount redacted] per 2 months, his wife knows"
  ),
  s5q62whynoresponse = c(
    "932" = "It was stopped because they were transferred here in [site], [province] since 2012"
  ),
  s5q65other = c(
    "1083" = "[Tagalog]",
    "1694" = "[Tagalog]",
    "1812" = "[Tagalog]"
  )
)

for (open_end_var in names(redacted_text)) {
  replacement <- redacted_text[[open_end_var]]
  # Overwrite the listed rows of this column with their redacted texts.
  mydata[[open_end_var]][as.integer(names(replacement))] <- unname(replacement)
}

# Drop the loop temporaries so the workspace holds the same objects as before.
rm(redacted_text, open_end_var, replacement)


#'# GPS data: Displace
# !!!No GPS data


#'# Save processed data in Stata and SPSS format



# Write the processed data next to the script as "<filename>_PU.dta" (Stata)
# and "<filename>_PU.sav" (SPSS).
haven::write_dta(mydata, path = sprintf("%s_PU.dta", filename))
haven::write_sav(mydata, path = sprintf("%s_PU.sav", filename))

# Add report title dynamically: the YAML block that follows reads `title_var`.
title_var <- paste("DOL-ILAB SDC -", filename)
#'---
#'  title: `r title_var`
#'---
