# Start from an empty workspace so objects left over from a previous run
# cannot leak into this script. `all.names = TRUE` also removes objects whose
# names begin with a dot. (The argument must be the logical TRUE; the
# lowercase symbol `t` is base R's transpose function, not a logical.)
rm(list = ls(all.names = TRUE))

# Base name of the dataset being processed; it is reused further down to
# build the "_PU" output file names and the report title.
filename <- "B_t1_schooling_relabelled" # !!!Update filename

# Helper script to load. Presumably it defines the helpers called later in
# this file (capture_tables, report_open) and loads `mydata` -- TODO confirm.
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!!No Direct PII
# !!!No Direct PII-team
# !!!No small locations
# Recode school schedules into groups
#
# School start time: inspect the raw distribution first. The `##` lines are
# the output recorded when the script was last run (the leading count of 180
# has no value name above it, so it appears to belong to blank responses).
table(mydata$t1_b7_school_hrs_begin)
##
## 1:00:00 PM 1:30:00 PM 10:00:00 AM 10:15:00 AM 10:19:00 AM 10:30:00 AM
## 180 15 2 109 1 1 8
## 11:00:00 AM 12:00:00 AM 12:00:00 PM 12:30:00 PM 3:00:00 PM 3:30:00 AM 3:40:00 AM
## 6 5 52 44 1 1 1
## 3:45:00 PM 4:39:00 PM 7:00:00 AM 7:15:00 AM 7:30:00 AM 8:00:00 AM 8:30:00 AM
## 1 1 23 2 24 23 9
## 8:45:00 AM 9:00:00 AM 9:01:00 AM 9:05:00 AM 9:10:00 AM 9:15:00 AM 9:20:00 AM
## 1 766 3 4 5 17 3
## 9:25:00 AM 9:30:00 AM 9:31:00 AM 9:35:00 AM 9:40:00 AM 9:45:00 AM 9:50:00 AM
## 2 1081 1 12 43 11 1

# Attach value labels in which every observed start time labels itself.
# The order of this vector is the order of the resulting labels.
begin_raw_times <- c(
  "1:00:00 PM", "1:30:00 PM",
  "10:00:00 AM", "10:15:00 AM", "10:19:00 AM", "10:30:00 AM",
  "11:00:00 AM",
  "12:00:00 AM", "12:00:00 PM", "12:30:00 PM",
  "3:00:00 PM", "3:30:00 AM", "3:40:00 AM", "3:45:00 PM",
  "4:39:00 PM",
  "7:00:00 AM", "7:15:00 AM", "7:30:00 AM",
  "8:00:00 AM", "8:30:00 AM", "8:45:00 AM",
  "9:00:00 AM", "9:01:00 AM", "9:05:00 AM", "9:10:00 AM", "9:15:00 AM",
  "9:20:00 AM", "9:25:00 AM", "9:30:00 AM", "9:31:00 AM", "9:35:00 AM",
  "9:40:00 AM", "9:45:00 AM", "9:50:00 AM"
)
mydata$t1_b7_school_hrs_begin <- labelled(
  mydata$t1_b7_school_hrs_begin,
  setNames(begin_raw_times, begin_raw_times)
)

# Coarsen the exact start times into bins (upper bound inclusive, e.g.
# 8:00 AM falls in "7:01:00-8:00:00 AM"). Each list name is the bin label
# and its element holds the raw times collapsed into that bin.
# NOTE(review): the value labels attached above still list the raw times
# after this recode -- confirm that is intended for the public-use file.
begin_bins <- list(
  "12:00:00-7:00:00 AM" = c(
    "12:00:00 AM", "3:30:00 AM", "3:40:00 AM", "7:00:00 AM"
  ),
  "7:01:00-8:00:00 AM" = c("7:15:00 AM", "7:30:00 AM", "8:00:00 AM"),
  "8:01:00-9:00:00 AM" = c("8:30:00 AM", "8:45:00 AM", "9:00:00 AM"),
  "9:01:00-10:00:00 AM" = c(
    "9:01:00 AM", "9:05:00 AM", "9:10:00 AM", "9:15:00 AM", "9:20:00 AM",
    "9:25:00 AM", "9:30:00 AM", "9:31:00 AM", "9:35:00 AM", "9:40:00 AM",
    "9:45:00 AM", "9:50:00 AM", "10:00:00 AM"
  ),
  "10:01:00-11:00:00 AM" = c(
    "10:15:00 AM", "10:19:00 AM", "10:30:00 AM", "11:00:00 AM"
  ),
  "11:01:00 AM-12:00:00 PM" = c("12:00:00 PM"),
  "12:01:00-5:00:00 PM" = c(
    "12:30:00 PM", "1:00:00 PM", "1:30:00 PM", "3:00:00 PM", "3:45:00 PM",
    "4:39:00 PM"
  )
)
# Apply the rules one raw time at a time, in the order listed above.
for (bin_label in names(begin_bins)) {
  for (raw_time in begin_bins[[bin_label]]) {
    is_raw <- mydata$t1_b7_school_hrs_begin == raw_time
    mydata$t1_b7_school_hrs_begin[is_raw] <- bin_label
  }
}
# Remove the temporaries so the workspace holds the same objects as before.
rm(begin_raw_times, begin_bins, bin_label, raw_time, is_raw)

# Distribution after recoding (recorded output below).
table(mydata$t1_b7_school_hrs_begin)
##
## 10:01:00-11:00:00 AM 11:01:00 AM-12:00:00 PM
## 180 16 52
## 12:00:00-7:00:00 AM 12:01:00-5:00:00 PM 7:01:00-8:00:00 AM
## 30 64 49
## 8:01:00-9:00:00 AM 9:01:00-10:00:00 AM
## 776 1292
# School end time: inspect the raw distribution first. The `##` lines are the
# output recorded when the script was last run (the leading count of 180 has
# no value name above it, so it appears to belong to blank responses).
table(mydata$t1_b7_school_hrs_end)
##
## 1:00:00 PM 1:30:00 PM 1:40:00 PM 10:00:00 AM 10:19:00 AM 11:00:00 AM
## 180 25 7 1 1 1 3
## 11:30:00 AM 11:45:00 AM 12:00:00 AM 12:00:00 PM 12:30:00 PM 2:00:00 PM 2:30:00 PM
## 2 2 5 51 54 20 10
## 2:40:00 PM 2:45:00 PM 3:00:00 PM 3:05:00 PM 3:10:00 PM 3:15:00 PM 3:20:00 PM
## 1 1 359 2 7 10 4
## 3:30:00 PM 3:31:00 PM 3:35:00 PM 3:40:00 PM 3:41:00 PM 3:43:00 PM 3:45:00 PM
## 960 1 9 265 1 1 44
## 4:00:00 PM 4:10:00 PM 4:15:00 PM 4:30:00 PM 4:39:00 PM 4:40:00 PM 4:45:00 PM
## 311 1 4 33 1 1 1
## 5:00:00 PM 5:30:00 PM 5:40:00 PM 5:45:00 PM 6:00:00 PM 6:30:00 PM 9:30:00 AM
## 30 35 2 1 9 1 1
## 9:30:00 PM
## 1

# Bin definitions (upper bound inclusive, e.g. 2:00 PM falls in
# "1:01:00-2:00:00 PM"). Each list name is the bin label and its element
# holds the raw end times collapsed into that bin, in chronological order.
#
# Every raw time appears exactly once. The earlier version of this script
# carried a second rule sending "2:00:00 PM" to "2:01:00-3:00:00 PM"; it could
# never match because the preceding rule had already rewritten those values,
# so it has been dropped. The recorded counts below (28 and 371) correspond
# to the mapping kept here.
end_bins <- list(
  "12:00:00 AM-12:00:00 PM" = c(
    "12:00:00 AM", "9:30:00 AM", "10:00:00 AM", "10:19:00 AM",
    "11:00:00 AM", "11:30:00 AM", "11:45:00 AM", "12:00:00 PM"
  ),
  "12:01:00-1:00:00 PM" = c("12:30:00 PM", "1:00:00 PM"),
  "1:01:00-2:00:00 PM" = c("1:30:00 PM", "1:40:00 PM", "2:00:00 PM"),
  "2:01:00-3:00:00 PM" = c(
    "2:30:00 PM", "2:40:00 PM", "2:45:00 PM", "3:00:00 PM"
  ),
  "3:01:00-4:00:00 PM" = c(
    "3:05:00 PM", "3:10:00 PM", "3:15:00 PM", "3:20:00 PM", "3:30:00 PM",
    "3:31:00 PM", "3:35:00 PM", "3:40:00 PM", "3:41:00 PM", "3:43:00 PM",
    "3:45:00 PM", "4:00:00 PM"
  ),
  "4:01:00-5:00:00 PM" = c(
    "4:10:00 PM", "4:15:00 PM", "4:30:00 PM", "4:39:00 PM", "4:40:00 PM",
    "4:45:00 PM", "5:00:00 PM"
  ),
  "5:01:00-10:00:00 PM" = c(
    "5:30:00 PM", "5:40:00 PM", "5:45:00 PM", "6:00:00 PM", "6:30:00 PM",
    "9:30:00 PM"
  )
)

# Attach value labels in which every observed end time labels itself. The
# labels are listed chronologically, which is exactly the order in which the
# raw times appear in `end_bins`.
end_raw_times <- unlist(end_bins, use.names = FALSE)
mydata$t1_b7_school_hrs_end <- labelled(
  mydata$t1_b7_school_hrs_end,
  setNames(end_raw_times, end_raw_times)
)

# Replace each raw end time with its bin label, one raw time at a time.
# NOTE(review): the value labels attached above still list the raw times
# after this recode -- confirm that is intended for the public-use file.
for (bin_label in names(end_bins)) {
  for (raw_time in end_bins[[bin_label]]) {
    is_raw <- mydata$t1_b7_school_hrs_end == raw_time
    mydata$t1_b7_school_hrs_end[is_raw] <- bin_label
  }
}
# Remove the temporaries so the workspace holds the same objects as before.
rm(end_bins, end_raw_times, bin_label, raw_time, is_raw)

# Distribution after recoding (recorded output below).
table(mydata$t1_b7_school_hrs_end)
##
## 1:01:00-2:00:00 PM 12:00:00 AM-12:00:00 PM
## 180 28 66
## 12:01:00-1:00:00 PM 2:01:00-3:00:00 PM 3:01:00-4:00:00 PM
## 79 371 1615
## 4:01:00-5:00:00 PM 5:01:00-10:00:00 PM
## 71 49
# Checklist steps that do not apply to this dataset:
#   - Recode education attainment of adults to reduce risk of
#     re-identification          -> !!!No education variables
#   - Top code household composition variables with large and unusual
#     numbers                    -> !!!No household composition variables
#   - Top code high income to the 99.5 percentile
#                                -> !!!No income variables

# !!!Include relevant variables in list below
# (Indirect PII - Categorical, and Ordinal if not processed yet)
indirect_PII <- c(
  "t1_b1_attend_school",
  "t1_b3a_same_school",
  "t1_b4_school_identical"
)
# Helper defined outside this file; it receives the variable names above.
capture_tables(indirect_PII)

# Recode those with very specific values.
# !!!Insufficient demographic data

# !!! Identify open-end variables here:
open_ends <- "t1_b2_other_class"
# Helper defined outside this file; it receives the open-end variable names.
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and
# their row number.
dropvars <- "t1_b2_other_class"
# Keep every column whose name is not listed in `dropvars`.
# Original note: dropped as it is actually verbatim data in Hindi.
mydata <- mydata[!(names(mydata) %in% dropvars)]

# !!!No GPS data

# Write the processed data as "<filename>_PU" in Stata and SPSS formats.
haven::write_dta(mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(mydata, path = paste0(filename, "_PU.sav"))

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)