#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from an empty workspace, hidden (dot-prefixed) objects included.
# The argument is spelled out as `all.names = TRUE`: the earlier `all=t`
# relied on partial argument matching and passed the base function `t`
# (matrix transpose) instead of the logical constant TRUE.
rm(list = ls(all.names = TRUE))

#'# Setup filenames

# !!! Update both values for every new dataset / helper-script release.
# `functions_vers` is the helper script sourced below; `filename` is the stem
# used later in this script for the exported files and the report title.
functions_vers <- "functions_1.7.R"
filename <- "DFM_InDepth20152016_StudentsParents_NOPII"

#'# Setup data, functions and create dictionary for dataset review
# NOTE(review): `mydata` and the recode/encode helpers used below are not
# defined in this file; presumably the sourced script creates them - confirm.
source(functions_vers)


#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 
#'# Direct PII: variables to be removed

# !!! Include any Direct PII variables.
# Every column named in `dropvars` is deleted from `mydata`; names that are
# not present in the data are simply ignored by the `%in%` filter.
dropvars <- c(
  "student_name", "name_pad", "num_telf",
  "future_parent", "school_parent", "education_parent",
  "pic_home", "consent_signed",
  "cto_padre_nom", "cto_padre_app1", "cto_padre_app2",
  paste0("p27a", 1:10),               # p27a1 ... p27a10
  paste0("p27d", 1:3),                # p27d1 ... p27d3
  "audio_video",
  "nompad", "app_pad", "nommad", "app_mad",
  "address", "dni", "NUMERO_DOCUMENTO",
  "guard_male_name", "conf_guard_male_name",
  "guard_male_surname", "conf_guard_male_surname",
  "guard_female_name", "conf_guard_female_name",
  "guard_female_surname", "conf_guard_female_surname",
  "nom_dist", "nombres", "fecha_nac_fixed",
  paste0("audio", 1:3, "_student")    # audio1_student ... audio3_student
)
mydata <- mydata[!(names(mydata) %in% dropvars)]

#'# Direct PII-team: Encode field team names
# !!! Replace the value of `variables` below with the relevant variable names.
# Column "i5" is deleted outright, while "id_encuestador" is handed to the
# field-team encoder, whose result replaces `mydata`.

mydata <- mydata[!(names(mydata) %in% "i5")]
mydata <- encode_direct_PII_team(variables = "id_encuestador")

#'# Small locations: Encode locations  with pop <100,000 using random large numbers
# !!! Include relevant variables, but check their population size first to
# confirm they are <100,000.
# All columns in `locvars` are passed to encode_location() in one call, with
# 999999 given as the missing-value code.

locvars <- c(
  "cod_mod_app", "Distrito", "Provincia", "prov", "dist",
  "cod_mod2", "COD_MOD", "cod_mod",
  "school_fixed_primary", "school_fixed_sec",
  "cole2016_admin", "cod_mod_2016", "cod_mod_2015",
  "p12", "codlocal", "s4p11b1_2015"
)
mydata <- encode_location(variables = locvars, missing = 999999)

# !!! School-name columns are removed because they contain identifying
# information. Note that this reuses (overwrites) the `dropvars` vector.

dropvars <- c(
  "nombre_colegio",
  "school2014_name", "school2014_name1",
  "school2013_name", "school2013_name1"
)

mydata <- mydata[!(names(mydata) %in% dropvars)]


#'# Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 


# Recode education attainment of adults to reduce risk of re-identification.
# For each variable, `break_edu` holds the cut points and `labels_edu` the
# value labels (label text -> numeric code) handed to ordinal_recode();
# 999999 is passed as the missing-value code throughout.

# p29_1a2: three secondary-school groups
break_edu <- c(10, 12, 13)
labels_edu <- setNames(
  c(1, 2, 3),
  c("1ro-2do de secundaria", "3ro de secundaria", "4to-5to de secundaria")
)
mydata <- ordinal_recode(variable = "p29_1a2", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p29_1a3: two secondary-school groups
break_edu <- c(10, 12)
labels_edu <- setNames(
  c(1, 2),
  c("1ro-2do de secundaria", "3ro de secundaria or more")
)
mydata <- ordinal_recode(variable = "p29_1a3", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p5a1: initial and primary merged, top category open-ended
break_edu <- c(-98, 0, 2, 3)
labels_edu <- setNames(
  c(1, 2, 3, 4),
  c("No se", "Inicial or Primaria", "Secundaria",
    "Superior no universitaria or more")
)
mydata <- ordinal_recode(variable = "p5a1", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p5a2: all levels kept, top category open-ended
break_edu <- c(-98, 0, 1, 2, 3)
labels_edu <- setNames(
  c(1, 2, 3, 4, 5),
  c("No se", "Inicial", "Primaria", "Secundaria",
    "Superior no universitaria or more")
)
mydata <- ordinal_recode(variable = "p5a2", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p5a3 and p5a4 use the same cut points and labels, so they are defined once.
break_edu <- c(-98, 0, 1, 2)
labels_edu <- setNames(
  c(1, 2, 3, 4),
  c("No se", "Inicial", "Primaria", "Secundaria or more")
)
mydata <- ordinal_recode(variable = "p5a3", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)
mydata <- ordinal_recode(variable = "p5a4", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)


# Recode the primary-grade variables p28a1-p28a4.
# `break_edu` holds the cut points and `labels_edu` the value labels
# (label text -> numeric code) handed to ordinal_recode(); 999999 is passed
# as the missing-value code.
# The labels "4to de primaria" and "3ro de primaria" were previously spelled
# with a digit zero ("4t0", "3r0"); they are corrected here because these
# labels are attached to the released dataset.

# p28a1: first two grades merged
break_edu <- c(4, 6, 7, 8, 9)
labels_edu <- c("1ro-2do de primaria" = 1,
                "3ro de primaria" = 2,
                "4to de primaria" = 3,
                "5to de primaria" = 4,
                "6to de primaria" = 5)
mydata <- ordinal_recode(variable = "p28a1", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p28a2 and p28a3 share one scheme: last two grades merged
break_edu <- c(4, 5, 6, 7, 8)
labels_edu <- c("1ro de primaria" = 1,
                "2do de primaria" = 2,
                "3ro de primaria" = 3,
                "4to de primaria" = 4,
                "5to-6to de primaria" = 5)
mydata <- ordinal_recode(variable = "p28a2", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)
mydata <- ordinal_recode(variable = "p28a3", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p28a4: two groups only
break_edu <- c(4, 6)
labels_edu <- c("1ro or 2do de primaria" = 1,
                "3ro de primaria or more" = 2)
mydata <- ordinal_recode(variable = "p28a4", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)




# Recode the p6* education variables. As elsewhere in this script,
# `break_edu` holds the cut points and `labels_edu` the value labels
# (label text -> numeric code); 999999 is passed as the missing-value code.

# p6_1 and p6_2 share one six-level scheme.
break_edu <- c(-98, -1, 0, 1, 2, 3)
labels_edu <- setNames(
  c(1, 2, 3, 4, 5, 6),
  c("No se",
    "sin nivel",
    "Inicial",
    "Primaria completa",
    "Secundaria completa",
    "Superior tecnica incompleta/completa or Superior universitaria completa/incompleta")
)
mydata <- ordinal_recode(variable = "p6_1", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)
mydata <- ordinal_recode(variable = "p6_2", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p6a1: four levels, bottom and top categories open-ended
break_edu <- c(-98, -1, 2, 3)
labels_edu <- setNames(
  c(1, 2, 3, 4),
  c("No se", "Primaria completa or less", "Secundaria completa",
    "Superior tecnica incompleta or more")
)
mydata <- ordinal_recode(variable = "p6a1", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p6a2 and p6a3 share one three-level scheme.
break_edu <- c(-98, -1, 2)
labels_edu <- setNames(
  c(1, 2, 3),
  c("No se", "Primaria completa or less", "Secundaria completa or more")
)
mydata <- ordinal_recode(variable = "p6a2", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)
mydata <- ordinal_recode(variable = "p6a3", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)

# p6b1: three levels split at primary
break_edu <- c(-98, -1, 1)
labels_edu <- setNames(
  c(1, 2, 3),
  c("No se", "Inicial or less", "Primaria completa or more")
)
mydata <- ordinal_recode(variable = "p6b1", break_points = break_edu,
                         missing = 999999, value_labels = labels_edu)


# Top code household composition variables with large and unusual numbers.
# Each call names the variable, its top-code threshold (`break_point`) and the
# two codes, 888 and 999999, that are passed as missing values.

mydata <- top_recode("p1",        break_point = 10,  missing = c(888, 999999))
mydata <- top_recode("p2c",       break_point = 6,   missing = c(888, 999999))
mydata <- top_recode("p2d",       break_point = 2,   missing = c(888, 999999))
mydata <- top_recode("p2e",       break_point = 1,   missing = c(888, 999999))
mydata <- top_recode("p2f",       break_point = 1,   missing = c(888, 999999))
mydata <- top_recode("p2g",       break_point = 1,   missing = c(888, 999999))
mydata <- top_recode("sc_ave_3a", break_point = 200, missing = c(888, 999999))
mydata <- top_recode("sc_ave_3b", break_point = 175, missing = c(888, 999999))


# Top code high income to the 99.5 percentile.
# The threshold for each variable is the floor of its 99.5th percentile,
# computed after discarding NA values and the 999999 missing code. Each
# threshold is taken from `mydata` as it stands just before that variable's
# top_recode() call.

# Floor of the 99.5th percentile of `values`, ignoring NA and 999999.
p995_floor <- function(values) {
  observed <- na.exclude(values)
  floor(quantile(observed[observed != 999999], probs = c(0.995)))
}

percentile_99.5 <- p995_floor(mydata$p7_1)
mydata <- top_recode(variable = "p7_1", break_point = percentile_99.5, missing = 999999)

percentile_99.5 <- p995_floor(mydata$p7_2)
mydata <- top_recode(variable = "p7_2", break_point = percentile_99.5, missing = 999999)

percentile_99.5 <- p995_floor(mydata$p7c1)
mydata <- top_recode(variable = "p7c1", break_point = percentile_99.5, missing = 999999)

percentile_99.5 <- p995_floor(mydata$p49)
mydata <- top_recode(variable = "p49", break_point = percentile_99.5, missing = 999999)

percentile_99.5 <- p995_floor(mydata$p7a1)
mydata <- top_recode(variable = "p7a1", break_point = percentile_99.5, missing = 999999)

percentile_99.5 <- p995_floor(mydata$p7a2)
mydata <- top_recode(variable = "p7a2", break_point = percentile_99.5, missing = 999999)

percentile_99.5 <- p995_floor(mydata$p7a3)
mydata <- top_recode(variable = "p7a3", break_point = percentile_99.5, missing = 999999)

percentile_99.5 <- p995_floor(mydata$p7a4)
mydata <- top_recode(variable = "p7a4", break_point = percentile_99.5, missing = 999999)

# Remove the "birthdate" column, as it contains identifying information.

keep_cols <- !(names(mydata) %in% "birthdate")
mydata <- mydata[keep_cols]


#'# Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)


# Names of the indirect-PII (categorical) variables to tabulate.
# Regularly numbered series are generated with paste0(); irregular names are
# written out. Element order matches the reviewed variable list exactly, and
# the temporaries stay inside local() so only `indirect_PII` is created.
indirect_PII <- local({
  # Per-hour slots of the time-use items: the hour itself plus a/b/c.
  abc_slots <- c("", "a", "b", "c")
  # Item suffixes shared by the p25* questions and their "_2015" counterparts.
  p25_head <- c("a1", "a2", "a3", "b", "c", "d", "e")
  p25_grid <- c("1a", "1b", "1c", "1d", "1e", "1f",
                "2g", "3h", "4i", "5j", "6k", "7l", "8m", "9n",
                "10o", "11p", "12q", "13r", "14s", "14t")
  p27_items <- c("a", "b", "c", "d", "e")

  c(
    "sexo", "i14", "telf_yesno", "gender_nino",
    paste0("dropout_reasons_", c(1:14, 99)),
    "p2a", "p2b",
    paste0("p3a", 1:10), paste0("p3b", 1:3),
    paste0("p3c", 1:2), paste0("p3d", 1:3),
    paste0("p26a", 1:10), paste0("p26c", 1:2), paste0("p26d", 1:3),
    "p4_1", "p4_2",
    paste0("p4a", 1:10), paste0("p4b", 1:3),
    paste0("p4c", 1:2), paste0("p4d", 1:3),
    paste0("p5_aa", 1:9),
    "p10_1", "p42",
    paste0("p44b_", c(1:7, 99)),
    "nivel",
    paste0("p1a_", 1:3),
    paste0("dout_reasons_", 2:14),
    "p16", "gender", "hazardous_work", "worst_forms", "child_labor",
    "juntos_dist_hogar", "juntos1", "juntos2", "juntos3", "juntos4",
    "juntos_rnu", "juntos_dist", "juntos", "juntos_ind",
    "pobn", "pobx", "D_distjuntos", "school_fixed_level",
    "D_liveswithmother", "D_liveswithfather",
    "p12c", "dout_reasons", "dout_reasons_1", "dout_decision",
    "genero", "p22a", "p22b",
    # act_sd: hours 4-21 each have the four slots; 22-24 and 1-3 are irregular
    paste0("act_sd_", rep(4:21, each = 4), abc_slots),
    "act_sd_22", "act_sd_22a", "act_sd_23", "act_sd_23a", "act_sd_24",
    paste0("act_sd_", 1:3),
    # act_wed: hours 4-22 each have the four slots; 23-24 and 1-3 are irregular
    paste0("act_wed_", rep(4:22, each = 4), abc_slots),
    "act_wed_23", "act_wed_23a", "act_wed_24",
    paste0("act_wed_", 1:3),
    paste0("p25", p25_head),
    paste0("p25_", p25_grid),
    paste0("p25_2", c("a", "b", "c", "d", "e", "f", "g1", "h", "i")),
    paste0("p27", p27_items),
    "switcher_2016", "switcher_2015", "switcher_2014",
    "asissted_2014", "same_school2014", "same_school2013", "asissted_2013",
    "s5p15a_2015", "s5p12a_2015", "s5p12c_2015", "s5p16_2015",
    "s6p22a_2015", "s6p22b_2015",
    paste0("s6p25p25", p25_head, "_2015"),
    paste0("s6p25_1p25_", p25_grid, "_2015"),
    paste0("s6p25_2p25_2", c("a", "b", "c", "d", "e", "f", "g", "h", "i"), "_2015"),
    paste0("s7p27", p27_items, "_2015"),
    "info4a_2015", "info4b_2015",
    "hazardous_work_2015", "worst_forms_2015", "child_labor_2015"
  )
})

# Tabulate every indirect-PII variable before any of them is recoded.
capture_tables(indirect_PII)


# Recode those with very specific values.
# Every activity variable uses the same five cut points, and the label codes
# are those same values 1-5, so the labels are built by naming `break_activity`.
# NOTE(review): several codes carry the same label "Otros" but keep distinct
# numeric codes; confirm that ordinal_recode() or a later step actually merges
# them, otherwise the original categories remain distinguishable.

break_activity <- c(1, 2, 3, 4, 5)

# p4_1 and p4_2 share one scheme.
labels_activity <- setNames(
  break_activity,
  c("Otros",
    "Otros",
    "Trabajo remunerado",
    "Quehaceres del hogar o trabajo no remunerado",
    "Otros")
)
mydata <- ordinal_recode(variable = "p4_1", break_points = break_activity,
                         missing = 999999, value_labels = labels_activity)
mydata <- ordinal_recode(variable = "p4_2", break_points = break_activity,
                         missing = 999999, value_labels = labels_activity)

labels_activity <- setNames(
  break_activity,
  c("Estudia",
    "Otros",
    "Trabajo remunerado",
    "Quehaceres del hogar o trabajo no remunerado",
    "Otros")
)
mydata <- ordinal_recode(variable = "p4a1", break_points = break_activity,
                         missing = 999999, value_labels = labels_activity)

labels_activity <- setNames(
  break_activity,
  c("Estudia",
    "Otros",
    "Otros",
    "Quehaceres del hogar o trabajo no remunerado",
    "Infante pre-escolar (menor de 2 anos)")
)
mydata <- ordinal_recode(variable = "p4a2", break_points = break_activity,
                         missing = 999999, value_labels = labels_activity)

# p4a3, p4a4 and p4a5 share one scheme.
labels_activity <- setNames(
  break_activity,
  c("Estudia",
    "Otros",
    "Otros",
    "Otros",
    "Infante pre-escolar (menor de 2 anos)")
)
mydata <- ordinal_recode(variable = "p4a3", break_points = break_activity,
                         missing = 999999, value_labels = labels_activity)
mydata <- ordinal_recode(variable = "p4a4", break_points = break_activity,
                         missing = 999999, value_labels = labels_activity)
mydata <- ordinal_recode(variable = "p4a5", break_points = break_activity,
                         missing = 999999, value_labels = labels_activity)

labels_activity <- setNames(
  break_activity,
  c("Otros",
    "Otros",
    "Otros",
    "Quehaceres del hogar o trabajo no remunerado",
    "Otros")
)
mydata <- ordinal_recode(variable = "p4b1", break_points = break_activity,
                         missing = 999999, value_labels = labels_activity)


#'# Matching and crosstabulations: Run automated PII check 

# Two sets of categorical key variables are checked independently.
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars <- c("sexo", "grado2015") ##!!! Replace with candidate categorical demo vars
selectedKeyVars2 <- c("i14", "p4_1")

# Build one sdcMicro object per key-variable set; the bare object names below
# print each object's summary into the report.
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial

sdcInitial2 <- createSdcObj(dat = mydata, keyVars = selectedKeyVars2)
sdcInitial2

#' Show values of key variable of records that violate k-anonymity
mydata <- labelDataset(mydata)

# Set 1: flag records whose value in column 2 of the individual-risk table is
# below 2, list their key-variable values, then run local suppression.
notAnon <- sdcInitial@risk$individual[, 2] < 2 # for 2-anonymity
mydata[notAnon, selectedKeyVars]
sdcFinal <- localSuppression(sdcInitial)

# Set 2: same steps for the second key-variable set.
# NOTE(review): sdcFinal and sdcFinal2 are not used again in this script, so
# the suppressed values are not written back to `mydata` - confirm intended.
notAnon2 <- sdcInitial2@risk$individual[, 2] < 2 # for 2-anonymity
mydata[notAnon2, selectedKeyVars2]
sdcFinal2 <- localSuppression(sdcInitial2)

#'# Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here:
# The "...o" time-use items run over hours 4-24 followed by 1-3, for both the
# act_sd_ and act_wed_ series.
open_ends <- c(
  "dropout_reasons_otro", "rp_finance_2a", "p15a_prop", "p44c", "p51a",
  "centro_poblado", "referencia", "school_fixed", "p11b", "p13c1",
  "rs_finance_2a", "q48",
  paste0("act_sd_", c(4:24, 1:3), "o"),
  paste0("act_wed_", c(4:24, 1:3), "o"),
  "switcher_2016_otro", "switcher_2015_otro", "switcher_2014_otro",
  "p35a1", "s4p11b_2015", "s4p13c1_2015", "s5p18_2015"
)

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number
# Every open-end variable listed above is deleted from the dataset, so the
# removal is done in one step against the same vector. To keep a variable,
# filter against a subset of `open_ends` instead.

mydata <- mydata[!(names(mydata) %in% open_ends)]


#'# GPS data: Displace

# Both GPS-related columns are removed from the dataset in one step; no
# coordinate displacement is applied here.
gps_cols <- c("geo_pointsaltitude", "geo_points1")
mydata <- mydata[!(names(mydata) %in% gps_cols)]


#'# Save processed data in Stata and SPSS format

# Output paths are derived from the dataset file stem.
dta_path <- paste0(filename, "_PU.dta")
sav_path <- paste0(filename, "_PU.sav")

# Write the Stata file, then reload it so the SPSS export starts from exactly
# what was written to disk.
haven::write_dta(mydata, dta_path)
mydata <- haven::read_dta(dta_path)

# Prepare column names for SPSS: strip a leading underscore and rename the two
# columns "ANEXO" and "COD_MOD".
# NOTE(review): presumably these names clash or are invalid in SPSS - confirm.
spss_names <- gsub("^_", "", colnames(mydata))
colnames(mydata) <- spss_names
mydata[is.na(mydata)] <- NA  # anything is.na() flags is stored as plain NA
spss_names <- names(mydata)
spss_names[spss_names == "ANEXO"] <- "ANEXO1"
spss_names[spss_names == "COD_MOD"] <- "cod_mod_spss"
names(mydata) <- spss_names
haven::write_sav(mydata, sav_path)


# Build the report title from the dataset file stem; the YAML block that
# follows injects it through inline R.
title_var <- sprintf("DOL-ILAB SDC - %s", filename)
#'---
#'  title: `r title_var`
#'---
