# Start from a clean workspace: remove every object in the current
# environment, including hidden (dot-prefixed) ones.
# `all.names` is spelled out and set to the logical TRUE; the bare symbol `t`
# is base R's transpose function, not a logical value.
rm(list = ls(all.names = TRUE))

Setup filenames

# Base name of the dataset under review, without file extension.
# !!! Update filename
filename <- "ehsection1_relabelled"

# Name of the helper-functions script that is sourced below.
# !!! Update helper functions file
functions_vers <- "functions_1.7.R"

Setup data, functions and create dictionary for dataset review

# Run the helper-functions script.
# NOTE(review): `mydata` is never created in this file, so the helper script
# presumably also loads the dataset and builds the dictionary -- confirm
# against the helper file.
source(file = functions_vers)

Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:

# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location with population <100,000
# Large Location: Location with population >100,000
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

Direct PII: variables to be removed

# Columns flagged as direct PII; they are removed from the working dataset.
dropvars <- c(
  "a115_hh_knwglwdge_mem",
  "a116_expenditure_active"
)
# Keep only the columns whose name does not appear in `dropvars`.
mydata <- mydata[is.na(match(names(mydata), dropvars))]

Direct PII-team: Encode field team names

# !!! No Direct PII-team

Small locations: Encode locations with pop <100,000 using random large numbers

!!!Include relevant variables, but check their population size first to confirm they are <100,000

# "dise" is removed outright rather than encoded.
dropvars <- c("dise")
mydata <- mydata[is.na(match(names(mydata), dropvars))]

# Block and village identifiers are re-coded by the encode_location() helper;
# the before/after frequency tables it prints are used to check the mapping.
locvars <- c(
  "a006_a_block_id",
  "a007_a_vill_id"
)
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## a006_a_block_id. 006 Block ID
##   1   2   3   4   5   6   7   8   9 
## 203 167 192 404  97 190 155 422 528 
## [1] "Frequency table after encoding"
## a006_a_block_id. 006 Block ID
## 279 280 281 282 283 284 285 286 287 
## 422 167 192 528 404  97 203 155 190 
## [1] "Frequency table before encoding"
## a007_a_vill_id. 007 Village ID
##   1   2   3   4   5   6   7   8   9  10  11  12  13  15  16  17  18  19  20  21  22 
##  16  16  16  15  20  30  28  14  15  15  17  24  24  15  18  21  16  17  18  30  22 
##  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43 
##  18  17  32  27  26  18  15  15  24  26  22  16  29  19  17  21  27  16  16  18  16 
##  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64 
##  28  20  23  21  19  17  17  16  18  26  24  27  18  16  21  13  24  20  16  18  18 
##  65  66  67  68  69  70  71  72  73  74  75  76  77  78  80  81  82  83  84  85  87 
##  29  16  18  21  23  13  16  19  16  23  23  17  22  29  30  16  22  17  17  13  16 
##  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##  22  15  19  19  19  20  13  17  23  29  21  25  18  24  21  15  19  13  31  14  27 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 
##  21  17  21  27  14  24  20  16  21  22  20  13  10  10 
## [1] "Frequency table after encoding"
## a007_a_vill_id. 007 Village ID
## 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 
##  18  30  14  22  15  16  16  16  17  17  19  25  23  30  17  10  31  15  22  16  21 
## 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 
##  22  19  15  17  24  16  24  19  32  17  21  17  16  21  23  18  29  19  22  13  15 
## 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 
##  17  17  23  27  28  16  24  13  16  19  29  21  16  24  20  27  22  20  23  18  13 
## 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 
##  16  15  16  13  15  28  18  17  24  29  29  21  16  20  27  19  20  16  15  18  20 
## 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 
##  15  14  21  20  27  18  24  22  21  24  26  14  23  26  19  17  10  18  16  16  26 
## 714 715 716 717 718 719 720 721 722 723 724 725 726 727 
##  21  16  18  21  13  13  18  21  17  18  27  30  16  18

Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values

# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Top-code land owned at 30 with the top_recode() helper.
# NOTE(review): the result is stored in `mydata2`, not `mydata`, and `mydata2`
# is not used again in the visible script, so this recode does not reach the
# saved files -- confirm that is intended. The printed "after" table also
# counts the 114 "Not applicable" rows under "30 or more" (172 in total), so
# check the `missing` argument before applying this to `mydata`.
mydata2 <- top_recode(
  variable = "a106_land_owned",
  break_point = 30,
  missing = NA
)
## [1] "Frequency table before encoding"
## a106_land_owned. 106 Land owned as of day of survey
##                 0               0.5 0.980000019073486                 1 
##               650                 1                 1               227 
##               1.5                 2               2.5                 3 
##                 1               286                 1               191 
##                 4                 5                 6                 7 
##               119               224                68                61 
##                 8                 9                10                11 
##                56                21               101                 4 
##                12                13                14                15 
##                36                 8                 7                43 
##                16                17                18                19 
##                 6                 2                 2                 1 
##                20                22                25                27 
##                48                 1                19                 1 
##                30                32                35                40 
##                20                 2                 2                11 
##                41                45                50                52 
##                 1                 1                13                 1 
##                60                80    Not applicable               100 
##                 2                 2               114                 1 
##               102               152 
##                 1                 1

## [1] "Frequency table after encoding"
## a106_land_owned. 106 Land owned as of day of survey
##                 0               0.5 0.980000019073486                 1 
##               650                 1                 1               227 
##               1.5                 2               2.5                 3 
##                 1               286                 1               191 
##                 4                 5                 6                 7 
##               119               224                68                61 
##                 8                 9                10                11 
##                56                21               101                 4 
##                12                13                14                15 
##                36                 8                 7                43 
##                16                17                18                19 
##                 6                 2                 2                 1 
##                20                22                25                27 
##                48                 1                19                 1 
##        30 or more 
##               172

# Top-code land cultivated at 30 with the top_recode() helper.
# NOTE(review): the result is stored in `mydata2`, not `mydata`, and `mydata2`
# is not used again in the visible script, so this recode does not reach the
# saved files -- confirm that is intended. The printed "after" table also
# counts the 131 "Not applicable" rows under "30 or more" (182 in total), so
# check the `missing` argument before applying this to `mydata`.
mydata2 <- top_recode(
  variable = "a107_a_land_cultivate",
  break_point = 30,
  missing = NA
)
## [1] "Frequency table before encoding"
## a107_a_land_cultivate. 107a Land cultivated including orchard and planation as of day of survey
##              0            0.5              1            1.5              2 
##            729              1            211              1            280 
##            2.5              3              4              5              6 
##              1            183            121            206             67 
##              7              8              9             10             11 
##             54             61             17             84              3 
##             12             13             14             15             16 
##             33              9              6             38              6 
##             17             18             19             20             22 
##              2              3              1             45              1 
##             23             25             30             32             35 
##              1             12             20              1              3 
##             40             45             50             60             80 
##             10              2              9              1              3 
## Not applicable            100 
##            131              2

## [1] "Frequency table after encoding"
## a107_a_land_cultivate. 107a Land cultivated including orchard and planation as of day of survey
##          0        0.5          1        1.5          2        2.5          3 
##        729          1        211          1        280          1        183 
##          4          5          6          7          8          9         10 
##        121        206         67         54         61         17         84 
##         11         12         13         14         15         16         17 
##          3         33          9          6         38          6          2 
##         18         19         20         22         23         25 30 or more 
##          3          1         45          1          1         12        182

# Top-code land cultivated in bighas at 15 with the top_recode() helper.
# NOTE(review): the result is stored in `mydata2`, not `mydata`, and `mydata2`
# is not used again in the visible script, so this recode does not reach the
# saved files -- confirm that is intended. The printed "after" table also
# counts the 2048 "Not applicable" rows under "15 or more" (2087 in total), so
# check the `missing` argument before applying this to `mydata`.
mydata2 <- top_recode(
  variable = "a107_b_bighas",
  break_point = 15,
  missing = NA
)
## [1] "Frequency table before encoding"
## a107_b_bighas. 107b If yes, land cultivated in bighas
##                 0 0.980000019073486                 1                 2 
##                13                 2                16                53 
##                 3                 4                 5                 6 
##                39                27                34                12 
##                 7                 8                 9                10 
##                14                 7                 3                45 
##                12                13                15                17 
##                 4                 2                 9                 1 
##                18                20                25                27 
##                 2                11                 3                 1 
##                30                40                50                60 
##                 3                 4                 3                 2 
##    Not applicable 
##              2048

## [1] "Frequency table after encoding"
## a107_b_bighas. 107b If yes, land cultivated in bighas
##                 0 0.980000019073486                 1                 2 
##                13                 2                16                53 
##                 3                 4                 5                 6 
##                39                27                34                12 
##                 7                 8                 9                10 
##                14                 7                 3                45 
##                12                13        15 or more 
##                 4                 2              2087

# Top-code the number of job cards at 3 with the top_recode() helper; this
# result IS written back to `mydata`.
# NOTE(review): the printed "after" table counts the 543 "Not applicable" rows
# under "3 or more" (601 in total). Presumably the `missing` argument should
# carry the "Not applicable" code so those rows stay separate -- confirm
# against the helper's definition.
mydata <- top_recode(
  variable = "a109_num_cards",
  break_point = 3,
  missing = NA
)
## [1] "Frequency table before encoding"
## a109_num_cards. 109 How many MGNREG job cards issued to household 
##              0              1              2              3              4 
##              1           1563            193             37             13 
##              5              6             10 Not applicable 
##              5              2              1            543

## [1] "Frequency table after encoding"
## a109_num_cards. 109 How many MGNREG job cards issued to household 
##         0         1         2 3 or more 
##         1      1563       193       601

Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Variables flagged as indirect PII; capture_tables() is the helper used to
# tabulate them for review.
indirect_PII <- c(
  "a101_religion", "a102_languange", "a104_employ_type",
  "a105_hh_important_indus", "a105_hh_important_indus_oth",
  "a110_bank_po_ac", "a111j_aff_sequestrated",
  "a112j_res_sequestrated", "a112p_res_gen_othr"
)
capture_tables(indirect_PII)

# Recode those with very specific values. 

# Collapse home language into Hindi / Other / Marwadi / Don't know.
# The printed crosstab shows ordinal_recode() cutting values into left-closed
# intervals that start at each break point, with the last interval open-ended.
# With breaks 1, 2, 14 only, code 98 ("Don't know") fell into the same
# interval as 14 and was labelled "Marwadi". The extra break at 98 keeps
# those respondents in their own category.
# NOTE(review): assumes ordinal_recode() accepts any number of break points
# with one value label per interval -- confirm against the helper file.
break_lan <- c(1, 2, 14, 98)
labels_lan <- c(
  "Hindi" = 1,
  "Other" = 2,
  "Marwadi" = 3,
  "Don't know" = 4
)
mydata <- ordinal_recode(
  variable = "a102_languange",
  break_points = break_lan,
  missing = 999999,
  value_labels = labels_lan
)

## [1] "Frequency table before encoding"
## a102_languange. 102 What language do you normally speak at home? 
##      Hindi     Bangla     Telegu    Punjabi       Urdu    Marwadi Don’t know 
##        275          1          1          2          2       2063         14 
##     recoded
##      [1,2) [2,14) [14,1e+06)
##   1    275      0          0
##   3      0      1          0
##   10     0      1          0
##   12     0      2          0
##   13     0      2          0
##   14     0      0       2063
##   98     0      0         14
## [1] "Frequency table after encoding"
## a102_languange. 102 What language do you normally speak at home? 
##   Hindi   Other Marwadi 
##     275       6    2077 
## [1] "Inspect value labels and relabel as necessary"
##   Hindi   Other Marwadi 
##       1       2       3
# Collapse religion into Hinduism / Islam / Other with ordinal_recode().
# The printed crosstab shows codes 3 (Christian) and 98 (Don't know) both
# landing in the last interval, which is labelled "Other".
# NOTE(review): the result is stored in `mydata2`, not `mydata`, and `mydata2`
# is not used again in the visible script, so `mydata` keeps the original
# religion categories -- confirm that is intended.
break_rel <- c(1, 2, 3)
labels_rel <- c(
  "Hinduism" = 1,
  "Islam" = 2,
  "Other" = 3
)
mydata2 <- ordinal_recode(
  variable = "a101_religion",
  break_points = break_rel,
  missing = 999999,
  value_labels = labels_rel
)

## [1] "Frequency table before encoding"
## a101_religion. 101 What is your religion? 
##   Hinduism      Islam  Christian Don't know 
##       1839        506          2         11 
##     recoded
##      [1,2) [2,3) [3,1e+06)
##   1   1839     0         0
##   2      0   506         0
##   3      0     0         2
##   98     0     0        11
## [1] "Frequency table after encoding"
## a101_religion. 101 What is your religion? 
## Hinduism    Islam    Other 
##     1839      506       13 
## [1] "Inspect value labels and relabel as necessary"
## Hinduism    Islam    Other 
##        1        2        3
# Columns removed because they act as strong identifiers.
dropvars <- c(
  "a105_hh_important_indus", "a105_hh_important_indus_oth",
  "a111j_aff_sequestrated", "a112j_res_sequestrated"
)
# Keep only the columns whose name does not appear in `dropvars`.
mydata <- mydata[is.na(match(names(mydata), dropvars))]

Matching and crosstabulations: Run automated PII check

# Variables for the sdcMicro object are chosen from the dictionary inspection.
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Every name must match a column in the data file.

# Categorical key variables used here: religion, social group and employment
# type.
# !!! Replace with candidate categorical demo vars
selectedKeyVars <- c("a101_religion", "a103_social_grp", "a104_employ_type")

# Weight variable (add if available).
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# Household id variable (cluster).
# !!! Replace with household id
selectedHouseholdID <- c("hh_id")

# Build the sdcMicro object from the selected variables, then print its
# summary.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)

sdcInitial
## The input dataset consists of 2358 rows and 81 variables.
##   --> Categorical key variables: a101_religion, a103_social_grp, a104_employ_type
##   --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##      Key Variable Number of categories     Mean size           Size of smallest (>0)
##     a101_religion                    4 (4)   589.500 (589.500)                     2
##   a103_social_grp                    5 (5)   471.600 (471.600)                    54
##  a104_employ_type                    8 (8)   294.750 (294.750)                     1
##      
##   (2)
##  (54)
##   (1)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 9 (0.382%)
##   - 3-anonymity: 21 (0.891%)
##   - 5-anonymity: 36 (1.527%)
## 
## ----------------------------------------------------------------------

Show values of key variable of records that violate k-anonymity

# mydata <- labelDataset(mydata)

# Logical flag per record: TRUE where the value in column 2 of the individual
# risk table is below 2, i.e. the record violates 2-anonymity.
# NOTE(review): column 2 is presumably the sample frequency count of the
# record's key-variable combination -- confirm against sdcMicro.
notAnon <- sdcInitial@risk$individual[, 2] < 2

# Print the key variables of the flagged records.
mydata[notAnon, selectedKeyVars]
## # A tibble: 9 x 3
##     a101_religion                    a103_social_grp                 a104_employ_type
##         <dbl+lbl>                          <dbl+lbl>                        <dbl+lbl>
## 1  2 [Islam]       4 [General Class/Caste]           4 [Casual labor in agriculture] 
## 2  1 [Hinduism]    2 [Scheduled Caste (SC)]          0 [Unemployed]                  
## 3  2 [Islam]       2 [Scheduled Caste (SC)]          3 [Regular wage / salary earnin~
## 4  2 [Islam]       1 [Scheduled Tribe (ST)]          5 [Casual labor in non-agricult~
## 5 98 [Don't know] 98 [Don’t know]                    3 [Regular wage / salary earnin~
## 6 98 [Don't know]  3 [Other Backward Class/Caste (O~ 3 [Regular wage / salary earnin~
## 7 98 [Don't know] 98 [Don’t know]                    4 [Casual labor in agriculture] 
## 8  2 [Islam]       1 [Scheduled Tribe (ST)]          2 [Self employed nonagricultura~
## 9  2 [Islam]       4 [General Class/Caste]           6 [Skilled worker]
# Run sdcMicro's local suppression on the initial object.
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables: print the key variables after suppression
# for the records flagged in `notAnon`, to show which cells were set to NA.
extractManipData(sdcFinal)[notAnon, selectedKeyVars]
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
##      a101_religion a103_social_grp a104_employ_type
## 44               2               4               NA
## 954              1               2               NA
## 1193             2               2               NA
## 1305             2               1               NA
## 1834            98              98               NA
## 2143            98              NA                3
## 2151            98              98               NA
## 2208             2              NA                2
## 2320             2               4               NA
# Apply the suppression pattern that localSuppression() stored in `sdcFinal`,
# instead of blanking a104_employ_type on every flagged row. The printed
# suppression result shows that for some flagged rows the suppressed key
# variable is a103_social_grp, and blanking only a104_employ_type left one
# record that still violated 2-anonymity.
suppressed_keys <- extractManipData(sdcFinal)[, selectedKeyVars, drop = FALSE]
for (key_var in selectedKeyVars) {
  # Blank only cells that are NA in the suppressed copy but still populated
  # in `mydata`.
  to_blank <- is.na(suppressed_keys[[key_var]]) & !is.na(mydata[[key_var]])
  if (any(to_blank)) {
    mydata[to_blank, key_var] <- NA
  }
}

# Rebuild the sdcMicro object on the suppressed data and print it to re-check
# the k-anonymity counts.
createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
## The input dataset consists of 2358 rows and 81 variables.
##   --> Categorical key variables: a101_religion, a103_social_grp, a104_employ_type
##   --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
## 
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
##      Key Variable Number of categories     Mean size           Size of smallest (>0)
##     a101_religion                    4 (4)   589.500 (589.500)                     2
##   a103_social_grp                    5 (5)   471.600 (471.600)                    54
##  a104_employ_type                    8 (8)   335.571 (335.571)                     2
##      
##   (2)
##  (54)
##   (2)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
## 
## Number of observations violating
##   - 2-anonymity: 1 (0.042%)
##   - 3-anonymity: 13 (0.551%)
##   - 5-anonymity: 23 (0.975%)
## 
## ----------------------------------------------------------------------

Open-ends: review responses for any sensitive information, redact as necessary

# !!! No further open-ends

GPS data: Displace

# !!! No GPS data

Save processed data in Stata and SPSS format

Adds "_PU" (Public Use) to the end of the name

# Write the processed dataset as Stata (.dta) and SPSS (.sav) files, each
# named after `filename` with a "_PU" suffix.
haven::write_dta(data = mydata, path = paste0(filename, "_PU.dta"))
haven::write_sav(data = mydata, path = paste0(filename, "_PU.sav"))

# Report title, built from the dataset name.
title_var <- paste0("DOL-ILAB SDC - ", filename)