#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from a clean workspace, including dot-prefixed objects.
# NOTE: the original call passed `t` (base R's transpose function) where the
# logical TRUE was intended, and relied on partial matching of `all.names`.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

# Base name of the dataset; also reused for the output file names and the
# report title further down.
filename <- "DFM_InDepth20162017_ParentsStudents_NOPII" # !!!Update filename
# Helper script that is sourced below.
functions_vers <- "functions_1.7.R" # !!!Update helper functions file

#'# Setup data, functions and create dictionary for dataset review
# NOTE(review): `mydata` and the helper functions used in the rest of this
# script are never defined here, so they are presumably created by this
# source() call - confirm against the helper file.
source(functions_vers)

#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 


#'# Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Column names flagged as Direct PII during the dictionary review. Any of
# these that are present in `mydata` are removed; names that are absent are
# simply ignored by the %in% test.
dropvars <- c(
  "student_name", "p_no_guardian_name", "sap_househead_name",
  "consent_signature", "consent_signature_paper",
  "hh_name1", "hh_name2", "hh_lastname1", "hh_lastname2", "hh_dni",
  "nombres", "i31", "address", "reference", "i32", "random_audio_hh",
  "jefe_nom", "jefe_priape", "jefe_segape",
  "bf1_fname", "bf1_sname", "bf1_flastname", "bf1_slastname",
  "bf2_fname", "bf2_sname", "bf2_flastname", "bf2_slastname",
  "bf3_fname", "bf3_sname", "bf3_flastname", "bf3_slastname",
  "DNI", "devicephonenum", "student_fullname", "no_guardian_name",
  "guard_name", "guard_app", "guard_apm", "guard_nn", "p4a",
  "audio1_student", "audio2_student", "audio3_student",
  "ss_phone", "ss_photo"
)
keep_cols <- !(names(mydata) %in% dropvars)
mydata <- mydata[keep_cols]

#'# Direct PII-team: Encode field team names

# !!!Replace vector in "variables" field below with relevant variable names
# The helper returns the updated dataset, which replaces `mydata`.
mydata <- encode_direct_PII_team(variables = "id_encuestador")

# !!! Removed as it contains identifying information
dropvars <- c("DIGITA", "i4")
keep_cols <- !(names(mydata) %in% dropvars)
mydata <- mydata[keep_cols]

#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!!Include relevant variables, but check their population size first to confirm they are <100,000

# Columns handed to encode_location(); 999999 is passed as the missing code.
locvars <- c(
  "p_cod_mod2016_admin", "cod_mod_2016", "cod_mod_2015", "cod_mod_app",
  "codlocal", "distrito", "CODLOC",
  "COD_MOD_2015", "COD_MOD_2016", "CODLOCAL_2016",
  "COD_MOD_2017", "CODLOCAL_2017",
  "cod_mod2016_admin", "cole2016_admin"
)
mydata <- encode_location(variables = locvars, missing = 999999)

# !!! Removed as it contains identifying information

# The numbered families are generated instead of typed out:
#   p_sc_info_311, p_sc_info_321, ..., p_sc_info_391
#   school<year>_name, school<year>_name1, school<year>_name1_extra
#     for years 2015 down to 2010
dropvars <- c(
  paste0("p_sc_info_3", 1:9, "1"),
  "NOMESC", "nombre_colegio", "prompt_cole_name",
  "cole2016_correct", "cole2016_new", "cole2016",
  "pref19b",
  paste0("school", rep(2015:2010, each = 3), "_name", c("", "1", "1_extra")),
  "hs_gps_where", "ss_gps_where"
)
keep_cols <- !(names(mydata) %in% dropvars)
mydata <- mydata[keep_cols]


#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Age variables are top-coded with a break point of 59; 888 and 999999 are
# passed as missing codes.
age_missing_codes <- c(888, 999999)
mydata <- top_recode("hh_ageinyears", break_point = 59, missing = age_missing_codes)
mydata <- top_recode("age_hh", break_point = 59, missing = age_missing_codes)

# `age` is truncated to whole numbers before being top-coded.
mydata$age <- trunc(mydata$age)
mydata <- top_recode("age", break_point = 59, missing = age_missing_codes)

# Remove birth dates and the remaining columns flagged as containing
# identifying information.
mydata <- mydata[!names(mydata) %in% c("hh_birthdate", "i15", "i16", "birthdate")]


# Recode education attainment of adults to reduce risk of re-identification 
# Each call hands ordinal_recode() a vector of break points and the value
# labels for the resulting categories; 999999 is the missing code.

# K_MaxEducLevel_hhpartner: nine categories.
break_edu <- c(-98, 1, 3:9)
labels_edu <- c(
  "No se"              = 1,
  "Pri Incomp or less" = 2,
  "Pri Comp"           = 3,
  "Sec Incomp"         = 4,
  "Sec Comp"           = 5,
  "Tec Incomp"         = 6,
  "Tec Comp"           = 7,
  "Uni Incomp"         = 8,
  "Uni Comp"           = 9
)
mydata <- ordinal_recode(
  variable = "K_MaxEducLevel_hhpartner",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

# K_MaxEducLevel_a2: university levels share the top category.
break_edu <- c(-98, 1:8)
labels_edu <- c(
  "No se"           = 1,
  "Sin nivel"       = 2,
  "Pri Incomp"      = 3,
  "Pri Comp"        = 4,
  "Sec Incomp"      = 5,
  "Sec Comp"        = 6,
  "Tec Incomp"      = 7,
  "Tec Comp"        = 8,
  "Uni Incomp/Comp" = 9
)
mydata <- ordinal_recode(
  variable = "K_MaxEducLevel_a2",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

# K_MaxEducLevel_a3: technical and university levels share the top category.
break_edu <- c(-98, 1:6)
labels_edu <- c(
  "No se"                              = 1,
  "Sin nivel"                          = 2,
  "Pri Incomp"                         = 3,
  "Pri Comp"                           = 4,
  "Sec Incomp"                         = 5,
  "Sec Comp"                           = 6,
  "Tec Incomp/Comp or Uni Incomp/Comp" = 7
)
mydata <- ordinal_recode(
  variable = "K_MaxEducLevel_a3",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

# K_MaxEducLevel_a4: five categories.
break_edu <- c(-98, 1, 3, 4, 6)
labels_edu <- c(
  "No se"                              = 1,
  "Pri Incomp or less"                 = 2,
  "Pri Comp"                           = 3,
  "Sec Incomp/Comp"                    = 4,
  "Tec Incomp/Comp or Uni Incomp/Comp" = 5
)
mydata <- ordinal_recode(
  variable = "K_MaxEducLevel_a4",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

# K_MaxEducLevel_a5: three categories.
break_edu <- c(-98, 1, 4)
labels_edu <- c(
  "No se"                   = 1,
  "Pri Incomp/Comp or less" = 2,
  "Sec Incomp/Comp or more" = 3
)
mydata <- ordinal_recode(
  variable = "K_MaxEducLevel_a5",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)

# mom_edu: the negative break points carry the five labelled
# missing/unreadable/error codes, followed by the six substantive codes.
break_edu <- c(-11, -10, -9, -8, -7, 1, 2, 3, 4, 5, 6)
labels_edu <- c(
  "Missing - Zenekon"             = 1,
  "Missing - IPA"                 = 2,
  "No indica"                     = 3,
  "No se puede leer"              = 4,
  "Error"                         = 5,
  "No termino secundaria"         = 6,
  "Termino secundaria"            = 7,
  "Termino carrera tecnica"       = 8,
  "Termino carrera universitaria" = 9,
  "No se"                         = 10,
  "Otro"                          = 11
)
mydata <- ordinal_recode(
  variable = "mom_edu",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)



# Top code household composition variables with large and unusual numbers,
# the number of rooms of the house, and the number of siblings studying in
# the same school.
# Each entry maps a variable name to its break point; the variables are
# processed in the order listed, with 888 and 999999 passed as missing codes.
count_break_points <- c(
  p1   = 10,  # household composition
  p2c  = 5,
  p2d  = 3,
  p2e  = 4,
  p2f  = 3,
  p2g  = 4,
  p9   = 7,   # number of rooms of the house
  p7a1 = 2,   # siblings studying in the same school
  p7b1 = 3
)
for (count_var in names(count_break_points)) {
  mydata <- top_recode(
    count_var,
    break_point = count_break_points[[count_var]],
    missing = c(888, 999999)
  )
}


# Top code high income to the 99.5 percentile
# The same treatment is applied, in this order, to the inc_*, whour_* and
# wsL_* families: the break point is the floored 99.5th percentile of the
# non-NA values that are not the 999999 missing code.
percentile_vars <- c(
  paste0("inc_", c("hh", "hhpartner", "b1", paste0("c", 1:6), "d1", "d2", "total")),
  paste0("whour_", c("hh", "hhpartner", "b1", "b2", paste0("c", 1:6), "d1", "d2")),
  paste0("wsL_", c("hh", "hhpartner", paste0("b", 1:3), paste0("c", 1:6), paste0("d", 1:3)))
)

for (percentile_var in percentile_vars) {
  # Drop NAs first, then the missing code, so the mask and the values it
  # filters always come from the same column.
  observed_values <- na.exclude(mydata[[percentile_var]])
  percentile_99.5 <- floor(quantile(observed_values[observed_values != 999999], probs = c(0.995)))
  mydata <- top_recode(variable = percentile_var, break_point = percentile_99.5, missing = 999999)
}


# Top code high education outlay to the 99.5 percentile
# The break point is the floored 99.5th percentile of p49 after removing NAs
# and the 999999 missing code. The missing-code mask must be built from p49
# itself: the original built it from p9 (a different variable), so the mask
# did not correspond to the p49 values it was filtering.
p49_observed <- na.exclude(mydata$p49)
percentile_99.5 <- floor(quantile(p49_observed[p49_observed != 999999], probs = c(0.995)))
mydata <- top_recode(variable = "p49", break_point = percentile_99.5, missing = 999999)

#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables handed to capture_tables() for review before recoding.
indirect_PII <- c(
  "p4c1", "p4_2", "p4b1", "p4c2", "p4_1", "p4c3", "p4c4",
  "p8",
  "p4b3", "p4c5", "p4c6",
  "idioma1", "idioma2"
)

capture_tables(indirect_PII)


# Recode those with very specific values. 

# Three label schemes are used for the activity variables. Several codes carry
# the same label text ("Otros") while keeping distinct numeric values.
# NOTE(review): whether ordinal_recode() merges codes that share a label is
# not visible from this script - confirm in the helper file.
activity_paid_or_home <- c(
  "Otros" = 1,
  "Otros" = 2,
  "Trabajo remunerado" = 3,
  "Quehaceres del hogar o trabajo no remunerado" = 4,
  "Otros" = 5
)
activity_detailed <- c(
  "Otros" = 1,
  "Estudia y tiene un trabajo remunerado" = 2,
  "Trabajo remunerado" = 3,
  "Quehaceres del hogar o trabajo no remunerado" = 4,
  "No hace nada" = 5
)
activity_paid_only <- c(
  "Otros" = 1,
  "Otros" = 2,
  "Trabajo remunerado" = 3,
  "Otros" = 4,
  "Otros" = 5
)

# Variable -> label scheme, in processing order.
activity_schemes <- list(
  p4c1 = activity_paid_or_home,
  p4_2 = activity_detailed,
  p4b1 = activity_detailed,
  p4c2 = activity_paid_or_home,
  p4_1 = activity_detailed,
  p4c3 = activity_paid_only
)

break_activity <- c(1, 2, 3, 4, 5)
for (activity_var in names(activity_schemes)) {
  labels_activity <- activity_schemes[[activity_var]]
  mydata <- ordinal_recode(
    variable = activity_var,
    break_points = break_activity,
    missing = 999999,
    value_labels = labels_activity
  )
}

# p8: three of the codes are labelled "Otro".
break_material <- c(1, 2, 3, 4, 5, 6, 7, 8, 99)
labels_material <- c(
  "Cemento" = 1,
  "Tejas" = 2,
  "Calamina de metal o metal" = 3,
  "Calamina de plastico o plastico" = 4,
  "Madera" = 5,
  "Otro" = 6,
  "Adobe" = 7,
  "Otro" = 8,
  "Otro" = 9
)
mydata <- ordinal_recode(
  variable = "p8",
  break_points = break_material,
  missing = 999999,
  value_labels = labels_material
)

# idioma1: the five negative codes are kept as labelled categories, followed
# by "Castellano - Espanol" and a catch-all "Otro".
break_language <- c(-11, -10, -9, -8, -7, 1, 2)
labels_language <- c(
  "Missing - Zenekon" = 1,
  "Missing - IPA" = 2,
  "No indica" = 3,
  "No se puede leer" = 4,
  "Error" = 5,
  "Castellano - Espanol" = 6,
  "Otro" = 7
)
mydata <- ordinal_recode(
  variable = "idioma1",
  break_points = break_language,
  missing = 999999,
  value_labels = labels_language
)

# idioma2: same negative codes, with a single "Otros" category above them.
break_language <- c(-11, -10, -9, -8, -7, 1)
labels_language <- c(
  "Missing - Zenekon" = 1,
  "Missing - IPA" = 2,
  "No indica" = 3,
  "No se puede leer" = 4,
  "Error" = 5,
  "Otros" = 6
)
mydata <- ordinal_recode(
  variable = "idioma2",
  break_points = break_language,
  missing = 999999,
  value_labels = labels_language
)




#'# Matching and crosstabulations: Run automated PII check 

# selected categorical key variables: gender, occupation/education and age
##!!! Replace with candidate categorical demo vars
selectedKeyVars <- c("hh_ageinyears", "d_mujer", "grado2016_admin")

# Build the sdcMicro object on the key variables and print its summary.
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial

#' Show values of key variable of records that violate k-anonymity
mydata <- labelDataset(mydata)
# Flag records for 2-anonymity: second column of the individual risk table
# below 2.
# NOTE(review): column 2 is presumably the sample frequency count - confirm
# against the sdcMicro documentation.
notAnon <- sdcInitial@risk$individual[, 2] < 2
mydata[notAnon, selectedKeyVars]
sdcFinal <- localSuppression(sdcInitial)

# Key variables of the flagged records after local suppression (manipulated
# variables HH), printed for comparison.
extractManipData(sdcFinal)[notAnon, selectedKeyVars]
# Blank out d_mujer for the flagged records in the working dataset, then
# print a fresh sdcMicro summary of the result.
mydata[notAnon, "d_mujer"] <- NA
createSdcObj(dat = mydata, keyVars = selectedKeyVars)

#'# Open-ends: review responses for any sensitive information, redact as necessary

# !!! Identify open-end variables here: 
# The numbered families are generated rather than typed out, keeping the
# original ordering (act_sd_* and act_wed_* run 4..24 and then 1..3).
activity_slots <- c(4:24, 1:3)
open_ends <- c(
  "hh_parentesco_other", "p_ave_finance_2a", "p_sc_info_2",
  paste0("p_sc_info_3", 1:9, "2"),
  "p_sc_info_5_1st", "p_sc_info_5_2nd", "p_sc_info_5_3rd",
  "p_sc_info_8", "p_sc_info_74", "p_sc_info_310",
  "p8a", "pref65f", "pref66f", "p44c", "i19a1",
  "p_ave_plans_45", "p_ave_returns_10_3",
  "name_schship", "p4b", "p13c1", "s_ave_finance_2a", "q48",
  paste0("act_sd_", activity_slots, "o"),
  paste0("act_wed_", activity_slots, "o"),
  "pref19a", "pref15f", "pref16f", "p35a1",
  "hs_gps_whereother", "s_ave_returns_11_3", "ss_gps_whereother"
)

# Hand the list to the reporting helper for review of the verbatim responses.
report_open(list_open_ends = open_ends)


# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number 

# All columns selected for deletion are removed in a single pass. The
# numbered families (p_sc_info_3x2, act_sd_<n>o, act_wed_<n>o) are generated
# with paste0(); order is irrelevant for the %in% test.
mydata <- mydata[!names(mydata) %in% c(
  "hh_parentesco_other", "p_ave_finance_2a", "p_sc_info_2",
  paste0("p_sc_info_3", 1:9, "2"),
  "p_sc_info_5_1st", "p_sc_info_5_2nd", "p_sc_info_5_3rd",
  "p_sc_info_8", "p_sc_info_74", "p_sc_info_310",
  "p8a", "pref65f", "pref66f", "p44c", "i19a1",
  "p_ave_plans_45", "p_ave_returns_10_3", "p13c1",
  "s_ave_finance_2a", "q48",
  paste0("act_sd_", 1:24, "o"),
  paste0("act_wed_", 1:24, "o"),
  "pref19a", "pref15f", "pref16f",
  "hs_gps_whereother", "s_ave_returns_11_3", "ss_gps_whereother",
  "p4b", "p35a1", "name_schship"
)]

#'# GPS data: Displace
# Setup map

#!!! Select correct country
countrymap <- filter(map_data("world"), region == "Peru")
#!!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
admin <- raster::getData("GADM", country = "PE", level = 0)

# Displace all pairs of GPS variables (Longitude, Latitude). Check summary statistics and maps before and after displacement. 

# !!!Include relevant variables, always longitude first, latitude second.
gps.vars <- c("i19longitude", "i19latitude")
# May take a few minutes to process.
mydata <- displace(gps.vars, admin = admin, samp_num = 1, other_num = 100000)

# The altitude columns are removed rather than displaced.
mydata <- mydata[!names(mydata) %in% c("i19altitude", "gpsaltitude_hh", "gpsaltitude")]

#'# Save processed data in Stata and SPSS format

# Stata export first, using the column names as they stand.
haven::write_dta(mydata, paste0(filename, "_PU.dta"))

# Name adjustments applied only ahead of the SPSS export: strip a leading
# underscore and give three upper-case columns a "_1" suffix.
# NOTE(review): the suffix presumably avoids clashes with similarly named
# lower-case columns in SPSS - confirm.
colnames(mydata) <- gsub("^_", "", colnames(mydata))
spss_renames <- c(
  ANEXO_2016   = "ANEXO_2016_1",
  COD_MOD_2015 = "COD_MOD_2015_1",
  COD_MOD_2016 = "COD_MOD_2016_1"
)
needs_rename <- names(mydata) %in% names(spss_renames)
names(mydata)[needs_rename] <- unname(spss_renames[names(mydata)[needs_rename]])
# Rewrite every missing cell as a plain NA before writing.
mydata[is.na(mydata)] <- NA
haven::write_sav(mydata, paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
