rm(list=ls(all=t))
filename <- "ehsection1_relabelled" # !!!Update filename
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
source (functions_vers)
Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
dropvars <- c("a115_hh_knwglwdge_mem", "a116_expenditure_active")
mydata <- mydata[!names(mydata) %in% dropvars]
# !!! No Direct PII-team
!!!Include relevant variables, but check their population size first to confirm they are <100,000
dropvars <- c("dise")
mydata <- mydata[!names(mydata) %in% dropvars]
locvars <- c("a006_a_block_id", "a007_a_vill_id")
mydata <- encode_location (variables= locvars, missing=999999)
## [1] "Frequency table before encoding"
## a006_a_block_id. 006 Block ID
## 1 2 3 4 5 6 7 8 9
## 203 167 192 404 97 190 155 422 528
## [1] "Frequency table after encoding"
## a006_a_block_id. 006 Block ID
## 279 280 281 282 283 284 285 286 287
## 422 167 192 528 404 97 203 155 190
## [1] "Frequency table before encoding"
## a007_a_vill_id. 007 Village ID
## 1 2 3 4 5 6 7 8 9 10 11 12 13 15 16 17 18 19 20 21 22
## 16 16 16 15 20 30 28 14 15 15 17 24 24 15 18 21 16 17 18 30 22
## 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## 18 17 32 27 26 18 15 15 24 26 22 16 29 19 17 21 27 16 16 18 16
## 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 28 20 23 21 19 17 17 16 18 26 24 27 18 16 21 13 24 20 16 18 18
## 65 66 67 68 69 70 71 72 73 74 75 76 77 78 80 81 82 83 84 85 87
## 29 16 18 21 23 13 16 19 16 23 23 17 22 29 30 16 22 17 17 13 16
## 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 22 15 19 19 19 20 13 17 23 29 21 25 18 24 21 15 19 13 31 14 27
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122
## 21 17 21 27 14 24 20 16 21 22 20 13 10 10
## [1] "Frequency table after encoding"
## a007_a_vill_id. 007 Village ID
## 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629
## 18 30 14 22 15 16 16 16 17 17 19 25 23 30 17 10 31 15 22 16 21
## 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
## 22 19 15 17 24 16 24 19 32 17 21 17 16 21 23 18 29 19 22 13 15
## 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
## 17 17 23 27 28 16 24 13 16 19 29 21 16 24 20 27 22 20 23 18 13
## 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692
## 16 15 16 13 15 28 18 17 24 29 29 21 16 20 27 19 20 16 15 18 20
## 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713
## 15 14 21 20 27 18 24 22 21 24 26 14 23 26 19 17 10 18 16 16 26
## 714 715 716 717 718 719 720 721 722 723 724 725 726 727
## 21 16 18 21 13 13 18 21 17 18 27 30 16 18
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.
mydata2 <- top_recode (variable="a106_land_owned", break_point=30, missing=NA)
## [1] "Frequency table before encoding"
## a106_land_owned. 106 Land owned as of day of survey
## 0 0.5 0.980000019073486 1
## 650 1 1 227
## 1.5 2 2.5 3
## 1 286 1 191
## 4 5 6 7
## 119 224 68 61
## 8 9 10 11
## 56 21 101 4
## 12 13 14 15
## 36 8 7 43
## 16 17 18 19
## 6 2 2 1
## 20 22 25 27
## 48 1 19 1
## 30 32 35 40
## 20 2 2 11
## 41 45 50 52
## 1 1 13 1
## 60 80 Not applicable 100
## 2 2 114 1
## 102 152
## 1 1
## [1] "Frequency table after encoding"
## a106_land_owned. 106 Land owned as of day of survey
## 0 0.5 0.980000019073486 1
## 650 1 1 227
## 1.5 2 2.5 3
## 1 286 1 191
## 4 5 6 7
## 119 224 68 61
## 8 9 10 11
## 56 21 101 4
## 12 13 14 15
## 36 8 7 43
## 16 17 18 19
## 6 2 2 1
## 20 22 25 27
## 48 1 19 1
## 30 or more
## 172
mydata2 <- top_recode (variable="a107_a_land_cultivate", break_point=30, missing=NA)
## [1] "Frequency table before encoding"
## a107_a_land_cultivate. 107a Land cultivated including orchard and planation as of day of survey
## 0 0.5 1 1.5 2
## 729 1 211 1 280
## 2.5 3 4 5 6
## 1 183 121 206 67
## 7 8 9 10 11
## 54 61 17 84 3
## 12 13 14 15 16
## 33 9 6 38 6
## 17 18 19 20 22
## 2 3 1 45 1
## 23 25 30 32 35
## 1 12 20 1 3
## 40 45 50 60 80
## 10 2 9 1 3
## Not applicable 100
## 131 2
## [1] "Frequency table after encoding"
## a107_a_land_cultivate. 107a Land cultivated including orchard and planation as of day of survey
## 0 0.5 1 1.5 2 2.5 3
## 729 1 211 1 280 1 183
## 4 5 6 7 8 9 10
## 121 206 67 54 61 17 84
## 11 12 13 14 15 16 17
## 3 33 9 6 38 6 2
## 18 19 20 22 23 25 30 or more
## 3 1 45 1 1 12 182
mydata2 <- top_recode (variable="a107_b_bighas", break_point=15, missing=NA)
## [1] "Frequency table before encoding"
## a107_b_bighas. 107b If yes, land cultivated in bighas
## 0 0.980000019073486 1 2
## 13 2 16 53
## 3 4 5 6
## 39 27 34 12
## 7 8 9 10
## 14 7 3 45
## 12 13 15 17
## 4 2 9 1
## 18 20 25 27
## 2 11 3 1
## 30 40 50 60
## 3 4 3 2
## Not applicable
## 2048
## [1] "Frequency table after encoding"
## a107_b_bighas. 107b If yes, land cultivated in bighas
## 0 0.980000019073486 1 2
## 13 2 16 53
## 3 4 5 6
## 39 27 34 12
## 7 8 9 10
## 14 7 3 45
## 12 13 15 or more
## 4 2 2087
mydata <- top_recode (variable="a109_num_cards", break_point=3, missing=NA)
## [1] "Frequency table before encoding"
## a109_num_cards. 109 How many MGNREG job cards issued to household
## 0 1 2 3 4
## 1 1563 193 37 13
## 5 6 10 Not applicable
## 5 2 1 543
## [1] "Frequency table after encoding"
## a109_num_cards. 109 How many MGNREG job cards issued to household
## 0 1 2 3 or more
## 1 1563 193 601
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
indirect_PII <- c("a101_religion",
"a102_languange",
"a104_employ_type",
"a105_hh_important_indus",
"a105_hh_important_indus_oth",
"a110_bank_po_ac",
"a111j_aff_sequestrated",
"a112j_res_sequestrated",
"a112p_res_gen_othr")
capture_tables (indirect_PII)
# Recode those with very specific values.
break_lan <- c(1,2,14)
labels_lan <- c("Hindi" =1,
"Other" =2,
"Marwadi" =3)
mydata <- ordinal_recode (variable="a102_languange", break_points=break_lan, missing=999999, value_labels=labels_lan)
## [1] "Frequency table before encoding"
## a102_languange. 102 What language do you normally speak at home?
## Hindi Bangla Telegu Punjabi Urdu Marwadi Don’t know
## 275 1 1 2 2 2063 14
## recoded
## [1,2) [2,14) [14,1e+06)
## 1 275 0 0
## 3 0 1 0
## 10 0 1 0
## 12 0 2 0
## 13 0 2 0
## 14 0 0 2063
## 98 0 0 14
## [1] "Frequency table after encoding"
## a102_languange. 102 What language do you normally speak at home?
## Hindi Other Marwadi
## 275 6 2077
## [1] "Inspect value labels and relabel as necessary"
## Hindi Other Marwadi
## 1 2 3
break_rel <- c(1,2,3)
labels_rel <- c("Hinduism" =1,
"Islam" =2,
"Other" =3)
mydata2 <- ordinal_recode (variable="a101_religion", break_points=break_rel, missing=999999, value_labels=labels_rel)
## [1] "Frequency table before encoding"
## a101_religion. 101 What is your religion?
## Hinduism Islam Christian Don't know
## 1839 506 2 11
## recoded
## [1,2) [2,3) [3,1e+06)
## 1 1839 0 0
## 2 0 506 0
## 3 0 0 2
## 98 0 0 11
## [1] "Frequency table after encoding"
## a101_religion. 101 What is your religion?
## Hinduism Islam Other
## 1839 506 13
## [1] "Inspect value labels and relabel as necessary"
## Hinduism Islam Other
## 1 2 3
dropvars <- c("a105_hh_important_indus",
"a105_hh_important_indus_oth",
"a111j_aff_sequestrated",
"a112j_res_sequestrated")
mydata <- mydata[!names(mydata) %in% dropvars] # Drop as strong identifier
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
selectedKeyVars = c('a101_religion', 'a103_social_grp', 'a104_employ_type') ##!!! Replace with candidate categorical demo vars
# weight variable (add if available)
# selectedWeightVar = c('projwt') ##!!! Replace with weight var
# household id variable (cluster)
selectedHouseholdID = c('hh_id') ##!!! Replace with household id
# creating the sdcMicro object with the assigned variables
sdcInitial <- createSdcObj(dat = mydata,
keyVars = selectedKeyVars,
hhId = selectedHouseholdID)
sdcInitial
## The input dataset consists of 2358 rows and 81 variables.
## --> Categorical key variables: a101_religion, a103_social_grp, a104_employ_type
## --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## a101_religion 4 (4) 589.500 (589.500) 2
## a103_social_grp 5 (5) 471.600 (471.600) 54
## a104_employ_type 8 (8) 294.750 (294.750) 1
##
## (2)
## (54)
## (1)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 9 (0.382%)
## - 3-anonymity: 21 (0.891%)
## - 5-anonymity: 36 (1.527%)
##
## ----------------------------------------------------------------------
Show values of key variable of records that violate k-anonymity
#mydata <- labelDataset(mydata)
notAnon <- sdcInitial@risk$individual[,2] < 2 # for 2-anonymity
mydata[notAnon,selectedKeyVars]
## # A tibble: 9 x 3
## a101_religion a103_social_grp a104_employ_type
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 2 [Islam] 4 [General Class/Caste] 4 [Casual labor in agriculture]
## 2 1 [Hinduism] 2 [Scheduled Caste (SC)] 0 [Unemployed]
## 3 2 [Islam] 2 [Scheduled Caste (SC)] 3 [Regular wage / salary earnin~
## 4 2 [Islam] 1 [Scheduled Tribe (ST)] 5 [Casual labor in non-agricult~
## 5 98 [Don't know] 98 [Don’t know] 3 [Regular wage / salary earnin~
## 6 98 [Don't know] 3 [Other Backward Class/Caste (O~ 3 [Regular wage / salary earnin~
## 7 98 [Don't know] 98 [Don’t know] 4 [Casual labor in agriculture]
## 8 2 [Islam] 1 [Scheduled Tribe (ST)] 2 [Self employed nonagricultura~
## 9 2 [Islam] 4 [General Class/Caste] 6 [Skilled worker]
sdcFinal <- localSuppression(sdcInitial)
# Recombining anonymized variables
extractManipData(sdcFinal)[notAnon,selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## a101_religion a103_social_grp a104_employ_type
## 44 2 4 NA
## 954 1 2 NA
## 1193 2 2 NA
## 1305 2 1 NA
## 1834 98 98 NA
## 2143 98 NA 3
## 2151 98 98 NA
## 2208 2 NA 2
## 2320 2 4 NA
mydata [notAnon,"a104_employ_type"] <- NA
createSdcObj(dat = mydata,
keyVars = selectedKeyVars,
hhId = selectedHouseholdID)
## The input dataset consists of 2358 rows and 81 variables.
## --> Categorical key variables: a101_religion, a103_social_grp, a104_employ_type
## --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size Size of smallest (>0)
## a101_religion 4 (4) 589.500 (589.500) 2
## a103_social_grp 5 (5) 471.600 (471.600) 54
## a104_employ_type 8 (8) 335.571 (335.571) 2
##
## (2)
## (54)
## (2)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 1 (0.042%)
## - 3-anonymity: 13 (0.551%)
## - 5-anonymity: 23 (0.975%)
##
## ----------------------------------------------------------------------
# !!! No further open-ends
# !!! No GPS data
Adds "_PU" (Public Use) to the end of the name
haven::write_dta(mydata, paste0(filename, "_PU.dta"))
haven::write_sav(mydata, paste0(filename, "_PU.sav"))
# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)