# Start from a clean workspace, including hidden (dot-prefixed) objects.
# `all.names = TRUE` is spelled out: the previous `all=t` handed the base
# function `t` (transpose) to `ls()` instead of a logical and relied on
# partial argument matching.
rm(list = ls(all.names = TRUE))
filename <- "ehsection2_relabelled" # !!!Update filename
functions_vers <- "functions_1.7.R" # !!!Update helper functions file
# Run the helper-functions file named above.
# NOTE(review): `mydata` is used below without being created in this script;
# presumably the sourced file loads it based on `filename` — confirm.
source(functions_vers)
# Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags:
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition.
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000)
# Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId,
# Open-ends: Review responses for any sensitive information, redact as necessary
# !!! No Direct PII
# !!! No Direct PII-team
# !!! Include relevant variables, but check their population size first to confirm they are <100,000
# Remove the columns listed in `dropvars` from the dataset.
dropvars <- c("dise")
keep_cols <- !(names(mydata) %in% dropvars)
mydata <- mydata[keep_cols]

# Pass the block and village identifiers to the project helper
# `encode_location()`; the recorded output shows the original ids replaced by
# new codes with the same frequencies.
locvars <- c("a006_a_block_id", "a007_a_vill_id")
mydata <- encode_location(variables = locvars, missing = 999999)
## [1] "Frequency table before encoding"
## a006_a_block_id. 006 Block ID
## 1 2 3 4 5 6 7 8 9
## 1310 1079 1288 2714 561 1284 1072 2835 3631
## [1] "Frequency table after encoding"
## a006_a_block_id. 006 Block ID
## 279 280 281 282 283 284 285 286 287
## 1310 2835 1079 1284 1288 3631 1072 561 2714
## [1] "Frequency table before encoding"
## a007_a_vill_id. 007 Village ID
## 1 2 3 4 5 6 7 8 9 10 11 12 13 15 16 17 18 19 20 21 22
## 131 111 101 114 141 204 182 90 84 93 103 162 141 79 101 151 96 108 109 205 139
## 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## 117 125 237 192 176 138 103 97 174 178 132 112 245 138 111 131 176 105 107 103 126
## 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 196 146 147 113 104 111 102 110 136 194 187 168 137 106 136 79 184 145 91 121 113
## 65 66 67 68 69 70 71 72 73 74 75 76 77 78 80 81 82 83 84 85 87
## 188 109 160 125 159 94 92 121 114 160 131 124 155 191 191 96 168 91 110 80 119
## 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 157 89 122 113 118 136 77 115 158 172 138 199 104 167 140 91 132 102 237 92 159
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122
## 138 112 146 187 96 159 138 90 131 145 124 96 52 70
## [1] "Frequency table after encoding"
## a007_a_vill_id. 007 Village ID
## 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629
## 109 131 79 103 191 104 126 113 191 101 145 136 136 176 187 102 122 138 102 131 237
## 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
## 112 52 162 96 237 196 103 94 192 107 176 103 111 96 204 138 199 96 158 155 119
## 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
## 131 93 90 97 105 136 124 113 111 91 188 110 90 108 125 194 111 146 182 178 104
## 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692
## 138 139 106 157 172 132 145 118 138 159 92 160 245 184 113 159 70 121 141 101 187
## 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713
## 125 124 91 174 160 115 159 109 205 168 121 146 96 132 77 137 112 140 151 89 114
## 714 715 716 717 718 719 720 721 722 723 724 725 726 727
## 114 167 110 168 91 92 138 79 117 131 147 84 141 80
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less.
# Top-code age at 80 with the project helper `top_recode()`; the recorded
# output shows ages of 80 and above collapsed into "80 or more".
# NOTE(review): in the recorded "before" table the value labelled "Don't know"
# (5 records) sits between 97 and 99 and ends up inside "80 or more" — confirm
# whether it should be treated as missing before top-coding.
mydata <- top_recode(
  variable = "a203_age_",
  break_point = 80,
  missing = NA
)
## [1] "Frequency table before encoding"
## a203_age_. 203 How old is [Name]?
## 1 2 3 4 5 6 7
## 208 144 179 208 257 226 347
## 8 9 10 11 12 13 14
## 361 351 635 526 1005 1042 822
## 15 16 17 18 19 20 21
## 722 582 459 537 293 319 126
## 22 23 24 25 26 27 28
## 164 108 58 159 55 46 95
## 29 30 31 32 33 34 35
## 37 357 42 220 135 107 742
## 36 37 38 39 40 41 42
## 135 117 311 80 815 52 169
## 43 44 45 46 47 48 49
## 84 53 489 40 42 69 17
## 50 51 52 53 54 55 56
## 276 9 41 27 23 139 26
## 57 58 59 60 61 62 63
## 17 48 10 275 12 46 33
## 64 65 66 67 68 69 70
## 13 159 10 9 24 4 174
## 71 72 73 74 75 76 77
## 4 16 10 4 61 2 4
## 78 79 80 81 82 83 85
## 3 1 64 1 8 1 21
## 86 87 88 89 90 92 93
## 1 2 1 1 15 4 1
## 95 96 97 Don't know 99 100
## 6 1 1 5 2 12
## [1] "Frequency table after encoding"
## a203_age_. 203 How old is [Name]?
## 1 2 3 4 5 6 7
## 208 144 179 208 257 226 347
## 8 9 10 11 12 13 14
## 361 351 635 526 1005 1042 822
## 15 16 17 18 19 20 21
## 722 582 459 537 293 319 126
## 22 23 24 25 26 27 28
## 164 108 58 159 55 46 95
## 29 30 31 32 33 34 35
## 37 357 42 220 135 107 742
## 36 37 38 39 40 41 42
## 135 117 311 80 815 52 169
## 43 44 45 46 47 48 49
## 84 53 489 40 42 69 17
## 50 51 52 53 54 55 56
## 276 9 41 27 23 139 26
## 57 58 59 60 61 62 63
## 17 48 10 275 12 46 33
## 64 65 66 67 68 69 70
## 13 159 10 9 24 4 174
## 71 72 73 74 75 76 77
## 4 16 10 4 61 2 4
## 78 79 80 or more
## 3 1 147
# Top-code employment income at 8000; the recorded output shows all larger
# amounts collapsed into "8000 or more".
mydata <- top_recode(
  variable = "a210_income_",
  break_point = 8000,
  missing = NA
)
## [1] "Frequency table before encoding"
## a210_income_. 210 How much income in cash or in kind did [Name] earn from employment in the la
## 0 1 2 5 6
## 1930 1 1 1 1
## 7 8 10 14 21
## 3 1 3 1 1
## 24 28 35 49 50
## 1 1 3 2 2
## 54 56 60 63 65
## 1 1 3 2 1
## 70 75 80 84 Don't know
## 10 1 1 1 171
## 99 100 105 120 122
## 15 25 3 1 1
## 123 125 130 140 142
## 1 1 1 5 1
## 150 160 170 171 175
## 16 3 1 1 6
## 180 182 188 196 200
## 1 2 1 1 39
## 201 210 215 220 225
## 2 3 1 1 2
## 227 231 238 250 264
## 1 1 4 10 3
## 265 270 280 300 320
## 1 1 1 38 1
## 325 340 350 357 360
## 2 1 43 2 1
## 375 400 420 450 462
## 3 61 4 9 1
## 480 486 490 500 525
## 1 2 2 66 1
## 540 550 560 571 588
## 1 3 11 4 3
## 600 625 630 640 650
## 92 3 3 1 9
## 668 696 700 714 720
## 1 1 187 5 1
## 735 750 770 780 798
## 1 29 1 1 1
## 800 804 830 835 840
## 43 2 1 1 6
## 850 857 875 900 918
## 1 8 2 32 1
## 920 931 945 950 960
## 1 4 1 5 1
## 980 996 Not applicable 1000 1042
## 4 1 10622 99 1
## 1050 1090 1100 1140 1142
## 61 1 10 1 8
## 1143 1150 1160 1162 1169
## 1 1 1 13 2
## 1190 1200 1225 1250 1260
## 1 108 3 28 1
## 1280 1285 1296 1300 1305
## 1 5 1 3 1
## 1379 1398 1400 1420 1428
## 2 2 383 1 8
## 1450 1460 1470 1480 1500
## 12 1 2 1 118
## 1512 1520 1560 1575 1600
## 1 1 1 1 19
## 1631 1632 1633 1650 1700
## 5 1 1 7 8
## 1750 1800 1820 1862 1869
## 73 55 2 2 1
## 1875 1890 1900 1950 2000
## 1 1 1 1 103
## 2007 2025 2050 2100 2140
## 1 1 2 296 1
## 2142 2150 2200 2245 2250
## 4 5 7 1 4
## 2285 2300 2310 2331 2333
## 1 3 3 2 2
## 2350 2400 2450 2500 2525
## 2 28 24 63 1
## 2547 2598 2600 2625 2630
## 1 1 4 1 1
## 2695 2700 2730 2750 2800
## 1 5 1 5 102
## 2857 3000 3150 3200 3262
## 1 72 3 2 1
## 3300 3360 3500 3550 3600
## 1 1 151 1 10
## 3750 3800 3850 4000 4200
## 1 2 6 29 31
## 4500 4662 4666 4700 4900
## 6 3 1 2 5
## 5000 5250 5350 5450 5600
## 16 1 1 1 2
## 5714 6000 6050 6600 6800
## 1 8 1 1 1
## 6996 7000 7142 8000 8162
## 1 10 1 3 1
## 8400 9500 10000 12000 12500
## 2 1 4 3 1
## 13600 14000 15000 17500 20000
## 1 3 1 1 1
## 30000 40000 47000 50000 70000
## 1 1 3 1 1
## [1] "Frequency table after encoding"
## a210_income_. 210 How much income in cash or in kind did [Name] earn from employment in the la
## 0 1 2 5 6
## 1930 1 1 1 1
## 7 8 10 14 21
## 3 1 3 1 1
## 24 28 35 49 50
## 1 1 3 2 2
## 54 56 60 63 65
## 1 1 3 2 1
## 70 75 80 84 Don't know
## 10 1 1 1 171
## 99 100 105 120 122
## 15 25 3 1 1
## 123 125 130 140 142
## 1 1 1 5 1
## 150 160 170 171 175
## 16 3 1 1 6
## 180 182 188 196 200
## 1 2 1 1 39
## 201 210 215 220 225
## 2 3 1 1 2
## 227 231 238 250 264
## 1 1 4 10 3
## 265 270 280 300 320
## 1 1 1 38 1
## 325 340 350 357 360
## 2 1 43 2 1
## 375 400 420 450 462
## 3 61 4 9 1
## 480 486 490 500 525
## 1 2 2 66 1
## 540 550 560 571 588
## 1 3 11 4 3
## 600 625 630 640 650
## 92 3 3 1 9
## 668 696 700 714 720
## 1 1 187 5 1
## 735 750 770 780 798
## 1 29 1 1 1
## 800 804 830 835 840
## 43 2 1 1 6
## 850 857 875 900 918
## 1 8 2 32 1
## 920 931 945 950 960
## 1 4 1 5 1
## 980 996 Not applicable 1000 1042
## 4 1 10622 99 1
## 1050 1090 1100 1140 1142
## 61 1 10 1 8
## 1143 1150 1160 1162 1169
## 1 1 1 13 2
## 1190 1200 1225 1250 1260
## 1 108 3 28 1
## 1280 1285 1296 1300 1305
## 1 5 1 3 1
## 1379 1398 1400 1420 1428
## 2 2 383 1 8
## 1450 1460 1470 1480 1500
## 12 1 2 1 118
## 1512 1520 1560 1575 1600
## 1 1 1 1 19
## 1631 1632 1633 1650 1700
## 5 1 1 7 8
## 1750 1800 1820 1862 1869
## 73 55 2 2 1
## 1875 1890 1900 1950 2000
## 1 1 1 1 103
## 2007 2025 2050 2100 2140
## 1 1 2 296 1
## 2142 2150 2200 2245 2250
## 4 5 7 1 4
## 2285 2300 2310 2331 2333
## 1 3 3 2 2
## 2350 2400 2450 2500 2525
## 2 28 24 63 1
## 2547 2598 2600 2625 2630
## 1 1 4 1 1
## 2695 2700 2730 2750 2800
## 1 5 1 5 102
## 2857 3000 3150 3200 3262
## 1 72 3 2 1
## 3300 3360 3500 3550 3600
## 1 1 151 1 10
## 3750 3800 3850 4000 4200
## 1 2 6 29 31
## 4500 4662 4666 4700 4900
## 6 3 1 2 5
## 5000 5250 5350 5450 5600
## 16 1 1 1 2
## 5714 6000 6050 6600 6800
## 1 8 1 1 1
## 6996 7000 7142 8000 or more
## 1 10 1 29
# Top-code school enrolment fees at 15000; the recorded output shows all
# larger amounts collapsed into "15000 or more".
mydata <- top_recode(
  variable = "a213_amt_paid_",
  break_point = 15000,
  missing = NA
)
## [1] "Frequency table before encoding"
## a213_amt_paid_. 213 How much did you have to pay in fees to [Name]’s school in order to enroll h
## 0 10 20 25 30
## 3289 3 1 1 4
## 40 50 52 60 70
## 1 62 1 14 4
## 75 80 Don't know 100 110
## 1 2 151 307 2
## 115 120 125 130 140
## 1 4 4 9 2
## 150 160 170 180 200
## 73 12 15 8 282
## 210 220 240 250 255
## 23 6 3 79 11
## 270 275 300 310 315
## 1 2 81 1 2
## 320 325 340 350 355
## 6 2 1 42 2
## 360 365 370 375 380
## 28 13 5 5 4
## 390 400 405 410 415
## 3 65 1 1 2
## 425 430 450 460 465
## 2 1 27 9 2
## 470 475 480 485 490
## 3 8 10 2 1
## 495 500 510 515 520
## 1 174 2 2 6
## 525 540 550 560 565
## 11 7 19 11 3
## 570 580 595 600 608
## 8 5 6 81 1
## 610 615 620 625 630
## 1 1 3 6 3
## 635 640 650 660 665
## 2 2 21 5 4
## 670 675 700 710 715
## 1 2 55 1 1
## 720 725 730 750 760
## 5 1 1 17 2
## 770 775 800 850 852
## 3 3 17 3 1
## 860 865 900 915 945
## 1 1 11 1 1
## 950 960 990 Not applicable 1000
## 2 1 1 9802 49
## 1070 1100 1150 1200 1230
## 1 13 1 16 1
## 1300 1350 1375 1400 1440
## 2 2 1 3 3
## 1450 1500 1600 1615 1700
## 2 24 2 1 3
## 1800 1900 2000 2200 2350
## 6 1 29 2 2
## 2400 2500 2800 2900 3000
## 8 16 5 1 69
## 3500 3600 4000 4100 4200
## 13 14 35 1 1
## 4500 4600 4800 5000 5400
## 12 1 7 57 5
## 5500 5800 6000 6300 6500
## 12 1 59 1 6
## 6600 6700 7000 7080 7200
## 2 2 34 1 12
## 7500 7600 8000 8400 8500
## 3 2 32 2 1
## 8600 9000 9500 9600 9800
## 1 23 1 4 1
## 10000 10500 10800 11000 12000
## 46 1 1 5 34
## 13000 13500 14000 14400 15000
## 4 1 2 2 12
## 16000 16800 17000 18000 20000
## 5 3 3 3 3
## 21000 25000 30000 35000 40000
## 1 5 2 2 1
## 50000 65000 90600
## 1 1 1
## [1] "Frequency table after encoding"
## a213_amt_paid_. 213 How much did you have to pay in fees to [Name]’s school in order to enroll h
## 0 10 20 25 30
## 3289 3 1 1 4
## 40 50 52 60 70
## 1 62 1 14 4
## 75 80 Don't know 100 110
## 1 2 151 307 2
## 115 120 125 130 140
## 1 4 4 9 2
## 150 160 170 180 200
## 73 12 15 8 282
## 210 220 240 250 255
## 23 6 3 79 11
## 270 275 300 310 315
## 1 2 81 1 2
## 320 325 340 350 355
## 6 2 1 42 2
## 360 365 370 375 380
## 28 13 5 5 4
## 390 400 405 410 415
## 3 65 1 1 2
## 425 430 450 460 465
## 2 1 27 9 2
## 470 475 480 485 490
## 3 8 10 2 1
## 495 500 510 515 520
## 1 174 2 2 6
## 525 540 550 560 565
## 11 7 19 11 3
## 570 580 595 600 608
## 8 5 6 81 1
## 610 615 620 625 630
## 1 1 3 6 3
## 635 640 650 660 665
## 2 2 21 5 4
## 670 675 700 710 715
## 1 2 55 1 1
## 720 725 730 750 760
## 5 1 1 17 2
## 770 775 800 850 852
## 3 3 17 3 1
## 860 865 900 915 945
## 1 1 11 1 1
## 950 960 990 Not applicable 1000
## 2 1 1 9802 49
## 1070 1100 1150 1200 1230
## 1 13 1 16 1
## 1300 1350 1375 1400 1440
## 2 2 1 3 3
## 1450 1500 1600 1615 1700
## 2 24 2 1 3
## 1800 1900 2000 2200 2350
## 6 1 29 2 2
## 2400 2500 2800 2900 3000
## 8 16 5 1 69
## 3500 3600 4000 4100 4200
## 13 14 35 1 1
## 4500 4600 4800 5000 5400
## 12 1 7 57 5
## 5500 5800 6000 6300 6500
## 12 1 59 1 6
## 6600 6700 7000 7080 7200
## 2 2 34 1 12
## 7500 7600 8000 8400 8500
## 3 2 32 2 1
## 8600 9000 9500 9600 9800
## 1 23 1 4 1
## 10000 10500 10800 11000 12000
## 46 1 1 5 34
## 13000 13500 14000 14400 15000 or more
## 4 1 2 2 43
# Top-code out-of-pocket education spending at 20000; the recorded output
# shows all larger amounts collapsed into "20000 or more".
mydata <- top_recode(
  variable = "a214_schl_edu_fess_",
  break_point = 20000,
  missing = NA
)
## [1] "Frequency table before encoding"
## a214_schl_edu_fess_. 214 In the last 12 months, how much has this household spent out of pocket for [
## Don't know 100 200 250 300
## 197 6 11 3 29
## 375 400 450 490 500
## 1 17 2 1 163
## 520 570 600 650 700
## 1 1 26 2 61
## 750 800 850 900 920
## 1 73 1 12 1
## 970 975 Not applicable 1000 1050
## 1 1 9802 737 2
## 1100 1150 1200 1233 1240
## 8 1 114 1 1
## 1250 1300 1320 1360 1400
## 3 17 1 1 9
## 1433 1450 1500 1600 1610
## 2 1 662 23 1
## 1666 1700 1800 1870 2000
## 2 5 8 1 1182
## 2050 2100 2200 2300 2330
## 1 6 5 4 2
## 2500 2600 2700 3000 3100
## 326 1 3 780 2
## 3300 3500 3600 3800 4000
## 3 49 4 3 372
## 4500 4800 5000 5200 5500
## 10 2 552 1 4
## 5600 5800 6000 6080 6200
## 2 1 114 1 1
## 6450 6500 7000 7200 8000
## 1 1 64 1 64
## 9000 9200 9500 9800 10000
## 7 2 1 1 96
## 11000 11100 12000 12300 13000
## 1 1 22 1 5
## 14000 15000 16000 17000 20000
## 3 20 1 2 14
## 22000 23000 24000 25000 27000
## 2 1 1 6 1
## 30000 35000 50000 70000 80000
## 5 1 2 1 1
## 150000
## 1
## [1] "Frequency table after encoding"
## a214_schl_edu_fess_. 214 In the last 12 months, how much has this household spent out of pocket for [
## Don't know 100 200 250 300
## 197 6 11 3 29
## 375 400 450 490 500
## 1 17 2 1 163
## 520 570 600 650 700
## 1 1 26 2 61
## 750 800 850 900 920
## 1 73 1 12 1
## 970 975 Not applicable 1000 1050
## 1 1 9802 737 2
## 1100 1150 1200 1233 1240
## 8 1 114 1 1
## 1250 1300 1320 1360 1400
## 3 17 1 1 9
## 1433 1450 1500 1600 1610
## 2 1 662 23 1
## 1666 1700 1800 1870 2000
## 2 5 8 1 1182
## 2050 2100 2200 2300 2330
## 1 6 5 4 2
## 2500 2600 2700 3000 3100
## 326 1 3 780 2
## 3300 3500 3600 3800 4000
## 3 49 4 3 372
## 4500 4800 5000 5200 5500
## 10 2 552 1 4
## 5600 5800 6000 6080 6200
## 2 1 114 1 1
## 6450 6500 7000 7200 8000
## 1 1 64 1 64
## 9000 9200 9500 9800 10000
## 7 2 1 1 96
## 11000 11100 12000 12300 13000
## 1 1 22 1 5
## 14000 15000 16000 17000 20000 or more
## 3 20 1 2 36
# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)
# Categorical indirect identifiers handed to the project helper
# `capture_tables()` (presumably tabulates each variable for review — confirm).
indirect_PII <- c(
  "a206_relation_",
  "a207_complete_edu_",
  "a208_employ_status_"
)
capture_tables(indirect_PII)
# Recode those with very specific values.
# Collapse relationship status with the project helper `ordinal_recode()`.
# Per the recorded cross-table, the break points yield the left-closed
# intervals [1,2), [2,3), [3,4), [4,5), [5,99) and [99, missing): "Divorced"
# (5) and "Widowed/Widower" (6) are merged, and "Not applicable" (999) becomes
# the sixth category.
break_rel <- c(1, 2, 3, 4, 5, 99)
# One label per recoded category. The sixth label was previously missing,
# which left the "Not applicable" records as an unlabelled value 6.
labels_rel <- c(
  "Single/not committed" = 1,
  "Single, committed or engaged" = 2,
  "Currently Married and cohabitating" = 3,
  "Married but not cohabitating" = 4,
  "Divorced/Widowed/Widower" = 5,
  "Not applicable" = 6
)
mydata <- ordinal_recode(
  variable = "a206_relation_",
  break_points = break_rel,
  missing = 999999,
  value_labels = labels_rel
)
## [1] "Frequency table before encoding"
## a206_relation_. 206 What is [Name]'s relationship status?
## Single/not committed Single, committed or engaged
## 6366 624
## Currently Married and cohabitating Married but not cohabitating
## 5671 1174
## Divorced Widowed/Widower
## 23 720
## Not applicable
## 1196
## recoded
## [1,2) [2,3) [3,4) [4,5) [5,99) [99,1e+06)
## 1 6366 0 0 0 0 0
## 2 0 624 0 0 0 0
## 3 0 0 5671 0 0 0
## 4 0 0 0 1174 0 0
## 5 0 0 0 0 23 0
## 6 0 0 0 0 720 0
## 999 0 0 0 0 0 1196
## [1] "Frequency table after encoding"
## a206_relation_. 206 What is [Name]'s relationship status?
## Single/not committed Single, committed or engaged
## 6366 624
## Currently Married and cohabitating Married but not cohabitating
## 5671 1174
## Divorced/Widowed/Widower 6
## 743 1196
## [1] "Inspect value labels and relabel as necessary"
## Single/not committed Single, committed or engaged
## 1 2
## Currently Married and cohabitating Married but not cohabitating
## 3 4
## Divorced/Widowed/Widower
## 5
# Print the current value labels of the education variable before recoding.
val_labels(mydata[["a207_complete_edu_"]])
## Never Attended School Grade 1
## 0 1
## Grade 2 Grade 3
## 2 3
## Grade 4 Grade 5
## 4 5
## Grade 6 Grade 7
## 6 7
## Grade 8 Grade 9
## 8 9
## Grade 10 Grade 11
## 10 11
## Grade 12 University / Not Graduate
## 12 13
## University / Graduate Post Bachelors Tertiary Education
## 14 15
## Technical training Professional studies
## 16 17
## Pre-primary Don't know
## 18 98
## Not applicable Don't know
## 999 NA
## Not applicable
## NA
# Collapse completed education with the project helper `ordinal_recode()`.
# Per the recorded cross-table, the break points yield the left-closed
# intervals [0,1), ..., [13,14), [14,18), [18,19) and [19, missing): original
# codes 14-17 are merged, 18 (pre-primary) stays on its own, and 98/999
# (don't know / not applicable) share the last interval.
break_edu <- c(0:14, 18, 19)
# Labels are keyed by the recoded category number (1-based), one per interval.
# Fixes: category 1 holds only "Never Attended School" (pre-primary lands in
# category 16), so its label no longer claims to include pre-school; category
# 17 was left unlabelled and is now named.
# NOTE(review): if pre-primary was meant to be merged with "Never Attended
# School", value 18 must be recoded before this step — confirm intent.
labels_edu <- c(
  "Never Attended School" = 1,
  "Grade 1" = 2,
  "Grade 2" = 3,
  "Grade 3" = 4,
  "Grade 4" = 5,
  "Grade 5" = 6,
  "Grade 6" = 7,
  "Grade 7" = 8,
  "Grade 8" = 9,
  "Grade 9" = 10,
  "Grade 10" = 11,
  "Grade 11" = 12,
  "Grade 12" = 13,
  "University / Not Graduate" = 14,
  "University / Graduate or higher" = 15,
  "Below primary" = 16,
  "Don't know / Not applicable" = 17
)
mydata <- ordinal_recode(
  variable = "a207_complete_edu_",
  break_points = break_edu,
  missing = 999999,
  value_labels = labels_edu
)
## [1] "Frequency table before encoding"
## a207_complete_edu_. 207 What is [Name]'s completed level of education?
## Never Attended School Grade 1
## 3707 336
## Grade 2 Grade 3
## 543 684
## Grade 4 Grade 5
## 631 1414
## Grade 6 Grade 7
## 718 2804
## Grade 8 Grade 9
## 1434 720
## Grade 10 Grade 11
## 715 191
## Grade 12 University / Not Graduate
## 381 100
## University / Graduate Post Bachelors Tertiary Education
## 103 31
## Technical training Professional studies
## 15 6
## Pre-primary Don't know
## 4 41
## Not applicable
## 1196
## recoded
## [0,1) [1,2) [2,3) [3,4) [4,5) [5,6) [6,7) [7,8) [8,9) [9,10) [10,11) [11,12)
## 0 3707 0 0 0 0 0 0 0 0 0 0 0
## 1 0 336 0 0 0 0 0 0 0 0 0 0
## 2 0 0 543 0 0 0 0 0 0 0 0 0
## 3 0 0 0 684 0 0 0 0 0 0 0 0
## 4 0 0 0 0 631 0 0 0 0 0 0 0
## 5 0 0 0 0 0 1414 0 0 0 0 0 0
## 6 0 0 0 0 0 0 718 0 0 0 0 0
## 7 0 0 0 0 0 0 0 2804 0 0 0 0
## 8 0 0 0 0 0 0 0 0 1434 0 0 0
## 9 0 0 0 0 0 0 0 0 0 720 0 0
## 10 0 0 0 0 0 0 0 0 0 0 715 0
## 11 0 0 0 0 0 0 0 0 0 0 0 191
## 12 0 0 0 0 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0 0 0 0 0 0
## 17 0 0 0 0 0 0 0 0 0 0 0 0
## 18 0 0 0 0 0 0 0 0 0 0 0 0
## 98 0 0 0 0 0 0 0 0 0 0 0 0
## 999 0 0 0 0 0 0 0 0 0 0 0 0
## recoded
## [12,13) [13,14) [14,18) [18,19) [19,1e+06)
## 0 0 0 0 0 0
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## 7 0 0 0 0 0
## 8 0 0 0 0 0
## 9 0 0 0 0 0
## 10 0 0 0 0 0
## 11 0 0 0 0 0
## 12 381 0 0 0 0
## 13 0 100 0 0 0
## 14 0 0 103 0 0
## 15 0 0 31 0 0
## 16 0 0 15 0 0
## 17 0 0 6 0 0
## 18 0 0 0 4 0
## 98 0 0 0 0 41
## 999 0 0 0 0 1196
## [1] "Frequency table after encoding"
## a207_complete_edu_. 207 What is [Name]'s completed level of education?
## Never Attended School or Only Attended Pre-School
## 3707
## Grade 1
## 336
## Grade 2
## 543
## Grade 3
## 684
## Grade 4
## 631
## Grade 5
## 1414
## Grade 6
## 718
## Grade 7
## 2804
## Grade 8
## 1434
## Grade 9
## 720
## Grade 10
## 715
## Grade 11
## 191
## Grade 12
## 381
## University / Not Graduate
## 100
## University / Graduate or higher
## 155
## Below primary
## 4
## 17
## 1237
## [1] "Inspect value labels and relabel as necessary"
## Never Attended School or Only Attended Pre-School
## 1
## Grade 1
## 2
## Grade 2
## 3
## Grade 3
## 4
## Grade 4
## 5
## Grade 5
## 6
## Grade 6
## 7
## Grade 7
## 8
## Grade 8
## 9
## Grade 9
## 10
## Grade 10
## 11
## Grade 11
## 12
## Grade 12
## 13
## University / Not Graduate
## 14
## University / Graduate or higher
## 15
## Below primary
## 16
# Print the current value labels of the employment-status variable before
# recoding.
val_labels(mydata[["a208_employ_status_"]])
## Not in labor force
## 0
## Unpaid Worker in Family Farm
## 1
## Unpaid Worker in Family Business
## 2
## Self Employed in Agriculture
## 3
## Self Employed in Outside of Agr
## 4
## Regular Wage or Salary Worker in Agr
## 5
## Regular Wage or Salary Worker Outside of Agr.
## 6
## Skilled worker
## 7
## Daily labourer
## 8
## Paif household worker
## 9
## Pensioner
## 10
## Farm labourer
## 11
## Chef
## 12
## Priest
## 13
# Collapse employment status with the project helper `ordinal_recode()`.
# Per the recorded cross-table, the break points yield the left-closed
# intervals [0,1), ..., [8,9), [9,11), [11,12) and [12, missing): original
# codes 9-10 (paid household worker, pensioner) are merged, 11 (farm labourer)
# stays on its own, and 12, 13, 98 and 999 share the last interval.
# Named `break_emp`/`labels_emp` so the education objects are not overwritten.
break_emp <- c(0:9, 11, 12)
# Labels are keyed by the recoded category number, which starts at 1. The
# previous labels were keyed 0..11, so every category carried the label of the
# next one (the 9418 "Not in labor force" records were labelled "Unpaid Worker
# in Family Farm") and category 12 had no label at all.
labels_emp <- c(
  "Not in labor force" = 1,
  "Unpaid Worker in Family Farm" = 2,
  "Unpaid Worker in Family Business" = 3,
  "Self Employed in Agriculture" = 4,
  "Self Employed in Outside of Agr" = 5,
  "Regular Wage or Salary Worker in Agr" = 6,
  "Regular Wage or Salary Worker Outside of Agr" = 7,
  "Skilled worker" = 8,
  "Daily labourer" = 9,
  "Other" = 10,
  "Farm Labourer" = 11,
  "Other, don't know or not applicable" = 12
)
mydata <- ordinal_recode(
  variable = "a208_employ_status_",
  break_points = break_emp,
  missing = 999999,
  value_labels = labels_emp
)
## [1] "Frequency table before encoding"
## a208_employ_status_. 208 What is [Name]'s employment status over the last 7 days?
## Not in labor force
## 9418
## Unpaid Worker in Family Farm
## 579
## Unpaid Worker in Family Business
## 142
## Self Employed in Agriculture
## 1307
## Self Employed in Outside of Agr
## 455
## Regular Wage or Salary Worker in Agr
## 184
## Regular Wage or Salary Worker Outside of Agr.
## 1646
## Skilled worker
## 479
## Daily labourer
## 277
## Paif household worker
## 4
## Pensioner
## 6
## Farm labourer
## 66
## Chef
## 1
## Priest
## 2
## 98
## 11
## 999
## 1196
## <NA>
## 1
## recoded
## [0,1) [1,2) [2,3) [3,4) [4,5) [5,6) [6,7) [7,8) [8,9) [9,11) [11,12)
## 0 9418 0 0 0 0 0 0 0 0 0 0
## 1 0 579 0 0 0 0 0 0 0 0 0
## 2 0 0 142 0 0 0 0 0 0 0 0
## 3 0 0 0 1307 0 0 0 0 0 0 0
## 4 0 0 0 0 455 0 0 0 0 0 0
## 5 0 0 0 0 0 184 0 0 0 0 0
## 6 0 0 0 0 0 0 1646 0 0 0 0
## 7 0 0 0 0 0 0 0 479 0 0 0
## 8 0 0 0 0 0 0 0 0 277 0 0
## 9 0 0 0 0 0 0 0 0 0 4 0
## 10 0 0 0 0 0 0 0 0 0 6 0
## 11 0 0 0 0 0 0 0 0 0 0 66
## 12 0 0 0 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 0 0 0
## 98 0 0 0 0 0 0 0 0 0 0 0
## 999 0 0 0 0 0 0 0 0 0 0 0
## recoded
## [12,1e+06)
## 0 0
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 1
## 13 2
## 98 11
## 999 1196
## [1] "Frequency table after encoding"
## a208_employ_status_. 208 What is [Name]'s employment status over the last 7 days?
## Unpaid Worker in Family Farm
## 9418
## Unpaid Worker in Family Business
## 579
## Self Employed in Agriculture
## 142
## Self Employed in Outside of Agr
## 1307
## Regular Wage or Salary Worker in Agr
## 455
## Regular Wage or Salary Worker Outside of Agr
## 184
## Skilled worker
## 1646
## Daily labourer
## 479
## Other
## 343
## Farm Labourer
## 10
## 12
## 1210
## <NA>
## 1
## [1] "Inspect value labels and relabel as necessary"
## Not in labor force
## 0
## Unpaid Worker in Family Farm
## 1
## Unpaid Worker in Family Business
## 2
## Self Employed in Agriculture
## 3
## Self Employed in Outside of Agr
## 4
## Regular Wage or Salary Worker in Agr
## 5
## Regular Wage or Salary Worker Outside of Agr
## 6
## Skilled worker
## 7
## Daily labourer
## 8
## Other
## 9
## Farm Labourer
## 10
## Other
## 11
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# selected categorical key variables: gender, occupation/education and age
# Categorical key variables used for the risk assessment.
selectedKeyVars <- c(
  "a204_gender_",
  "a203_age_",
  "a207_complete_edu_"
) ## !!! Replace with candidate categorical demo vars
# No weight variable is set here; add one if available.
# selectedWeightVar <- c("projwt") ## !!! Replace with weight var
# Household identifier, passed as the cluster variable.
selectedHouseholdID <- c("hh_id") ## !!! Replace with household id
# Build the sdcMicro object from the variables chosen above and print its
# summary.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
sdcInitial
## The input dataset consists of 15774 rows and 34 variables.
## --> Categorical key variables: a204_gender_, a203_age_, a207_complete_edu_
## --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size
## a204_gender_ 2 (2) 7887.000 (7887.000)
## a203_age_ 80 (80) 197.175 (197.175)
## a207_complete_edu_ 17 (17) 927.882 (927.882)
## Size of smallest (>0)
## 6581 (6581)
## 1 (1)
## 4 (4)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 305 (1.934%)
## - 3-anonymity: 617 (3.911%)
## - 5-anonymity: 1222 (7.747%)
##
## ----------------------------------------------------------------------
# Show the key-variable values of records that violate k-anonymity
#mydata <- labelDataset(mydata)
# Flag records that violate 2-anonymity (value in column 2 of the
# individual-risk matrix below 2) and show their key variables.
# NOTE(review): column 2 is presumably the sample frequency `fk` — confirm.
risk_individual <- sdcInitial@risk$individual
notAnon <- risk_individual[, 2] < 2
mydata[notAnon, selectedKeyVars]
## # A tibble: 305 x 3
## a204_gender_ a203_age_ a207_complete_edu_
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 0 [Female] 11 9 [Grade 8]
## 2 0 [Female] 17 2 [Grade 1]
## 3 0 [Female] 15 15 [University / Graduate or higher]
## 4 0 [Female] 20 2 [Grade 1]
## 5 0 [Female] 12 13 [Grade 12]
## 6 0 [Female] 15 17
## 7 1 [Male] 53 8 [Grade 7]
## 8 1 [Male] 44 4 [Grade 3]
## 9 0 [Female] 75 9 [Grade 8]
## 10 0 [Female] 42 5 [Grade 4]
## # ... with 295 more rows
# Run sdcMicro's local suppression on the initial object, then display the
# key variables of the flagged records as they look after suppression.
sdcFinal <- localSuppression(sdcInitial)
manipulated <- extractManipData(sdcFinal)
manipulated[notAnon, selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## a204_gender_ a203_age_ a207_complete_edu_
## 137 0 NA 9
## 251 0 NA 2
## 921 0 NA 15
## 1050 0 NA 2
## 1813 0 NA 13
## 2257 0 NA 17
## 2435 1 NA 8
## 2473 1 NA 4
## 2479 0 NA 9
## 2494 0 NA 5
## 2543 0 NA 5
## 2557 1 NA 8
## 2588 0 NA 14
## 2656 1 NA 5
## 2703 0 NA 8
## 2818 1 NA 2
## 2855 0 NA 8
## 2867 0 NA 10
## 2887 0 NA 11
## 2889 0 NA 15
## 2923 1 NA 11
## 2959 0 NA 11
## 2966 1 NA 4
## 3055 0 NA 9
## 3116 1 NA 9
## 3143 1 NA 9
## 3159 1 NA 5
## 3210 1 NA 14
## 3235 1 NA 7
## 3251 0 NA 9
## 3253 0 NA 11
## 3265 0 NA 9
## 3348 0 NA 17
## 3355 1 NA 15
## 3357 0 NA 9
## 3387 0 NA 4
## 3452 0 NA 15
## 3454 0 NA 12
## 3623 1 NA 3
## 3646 1 NA 5
## 3658 0 NA 13
## 3798 1 NA 9
## 3858 1 NA 4
## 3932 0 NA 3
## 3956 1 NA 9
## 3971 1 NA 12
## 4074 1 NA 15
## 4107 1 NA 3
## 4207 1 NA 8
## 4231 0 NA 13
## 4259 0 NA 2
## 4319 0 7 NA
## 4326 1 NA 9
## 4391 1 NA 1
## 4412 1 NA 2
## 4480 1 NA 5
## 4505 1 NA 8
## 4507 1 NA 8
## 4564 0 NA 2
## 4620 1 NA 12
## 4657 1 NA 2
## 4666 1 NA 7
## 4679 0 NA 3
## 4712 1 NA 2
## 4713 0 NA 2
## 4714 1 NA 4
## 4722 0 NA 5
## 4726 0 NA 6
## 4736 1 NA 12
## 4749 1 NA 8
## 4867 1 NA 15
## 4906 1 NA 7
## 4931 1 NA 14
## 4934 0 NA 3
## 4936 0 NA 15
## 4961 1 NA 12
## 4970 0 NA 4
## 5153 0 NA 4
## 5168 1 NA 3
## 5268 1 NA 3
## 5279 1 NA 12
## 5297 0 NA 3
## 5312 1 NA 2
## 5323 1 NA 5
## 5385 1 NA 8
## 5396 0 NA 7
## 5455 1 NA 13
## 5537 0 NA 5
## 5588 1 NA 7
## 5635 1 NA 5
## 5742 1 NA 6
## 5764 1 NA 12
## 5783 1 NA 13
## 5789 0 NA 13
## 5837 1 NA 3
## 5855 1 NA 9
## 5888 1 NA 7
## 5936 1 NA 11
## 6008 0 NA 4
## 6076 1 NA 5
## 6228 0 NA 9
## 6365 1 NA 15
## 6421 0 NA 6
## 6442 0 NA 9
## 6443 0 NA 15
## 6447 1 NA 5
## 6474 0 NA 3
## 6479 0 NA 1
## 6481 0 NA 2
## 6630 1 NA 10
## 6669 0 NA 1
## 6692 0 NA 11
## 6754 1 NA 3
## 6762 1 NA 7
## 6771 1 NA 3
## 6796 0 NA 11
## 6805 0 NA 8
## 6813 0 NA 4
## 6832 1 NA 15
## 6948 0 NA 5
## 6962 1 NA 11
## 7032 0 NA 6
## 7133 1 NA 3
## 7139 0 NA 6
## 7141 0 NA 12
## 7251 1 NA 2
## 7292 1 NA 11
## 7303 1 NA 8
## 7349 0 NA 8
## 7409 1 NA 15
## 7424 1 NA 15
## 7425 0 NA 4
## 7543 1 NA 1
## 7560 0 NA 13
## 7583 0 NA 5
## 7620 1 NA 4
## 7668 0 NA 8
## 7678 0 NA 8
## 7751 1 NA 5
## 7774 1 NA 3
## 7780 1 NA 13
## 7797 1 NA 14
## 7827 0 NA 8
## 7838 0 NA 10
## 7881 1 NA 15
## 7912 1 NA 6
## 7939 0 NA 15
## 7942 0 NA 15
## 7944 0 NA 12
## 7946 1 NA 14
## 7994 1 NA 10
## 8039 1 NA 17
## 8044 1 NA 5
## 8109 0 NA 7
## 8126 1 NA 2
## 8143 1 NA 1
## 8240 1 NA 10
## 8242 1 NA 8
## 8290 0 NA 13
## 8476 1 NA 5
## 8616 1 NA 14
## 8654 1 NA 7
## 8681 1 NA 17
## 8907 1 NA 1
## 8950 0 NA 13
## 9010 0 NA 6
## 9021 0 NA 7
## 9067 0 NA 14
## 9095 0 NA 6
## 9152 0 NA 4
## 9154 0 NA 6
## 9155 0 NA 4
## 9175 0 NA 11
## 9209 0 NA 6
## 9241 1 NA 12
## 9270 0 NA 10
## 9307 1 NA 4
## 9393 0 NA 3
## 9446 0 NA 13
## 9565 1 NA 14
## 9635 0 NA 17
## 9641 0 NA 4
## 9814 1 NA 4
## 9840 1 NA 12
## 9888 0 NA 5
## 9924 1 NA 5
## 10010 1 NA 6
## 10123 1 NA 8
## 10140 1 NA 6
## 10143 1 NA 12
## 10223 1 NA 9
## 10251 0 NA 12
## 10254 1 NA 14
## 10264 1 NA 7
## 10291 1 NA 13
## 10317 0 NA 2
## 10336 1 NA 7
## 10375 0 NA 9
## 10402 0 NA 13
## 10409 1 NA 9
## 10468 1 NA 10
## 10551 1 NA 11
## 10561 1 NA 12
## 10595 1 NA 3
## 10606 1 NA 13
## 10693 0 NA 3
## 10726 1 NA 2
## 10729 0 NA 2
## 10823 1 NA 12
## 10913 0 NA 15
## 10935 1 NA 3
## 11189 1 NA 10
## 11219 1 NA 3
## 11307 1 NA 8
## 11335 1 NA 15
## 11360 1 NA 4
## 11426 1 NA 5
## 11457 1 NA 12
## 11464 1 NA 6
## 11500 1 NA 2
## 11528 1 NA 6
## 11607 1 NA 9
## 11610 0 NA 4
## 11623 1 NA 8
## 11632 1 NA 6
## 11977 1 NA 4
## 11990 1 NA 3
## 12006 0 NA 3
## 12035 1 NA 13
## 12062 1 NA 15
## 12138 1 NA 7
## 12143 1 NA 12
## 12172 1 NA 10
## 12299 0 NA 5
## 12313 0 NA 2
## 12361 1 NA 12
## 12362 0 NA 3
## 12475 1 NA 7
## 12625 0 NA 13
## 12643 0 NA 5
## 12697 0 NA 6
## 12701 0 NA 12
## 12863 1 NA 2
## 12902 1 NA 17
## 13003 1 NA 6
## 13007 1 NA 17
## 13070 1 NA 2
## 13075 1 NA 5
## 13126 1 NA 17
## 13181 0 NA 1
## 13345 1 NA 7
## 13491 0 NA 10
## 13531 1 NA 4
## 13538 1 NA 7
## 13643 1 NA 13
## 13734 0 NA 5
## 13778 0 NA 10
## 13786 1 NA 1
## 13812 1 NA 13
## 13854 1 NA 8
## 13906 1 NA 11
## 13975 1 NA 3
## 14030 1 NA 17
## 14031 1 NA 13
## 14077 0 NA 12
## 14089 0 NA 5
## 14094 0 NA 13
## 14099 1 NA 2
## 14122 1 NA 17
## 14142 1 NA 13
## 14174 0 NA 2
## 14230 0 NA 17
## 14256 0 NA 17
## 14272 1 NA 10
## 14301 0 NA 6
## 14572 1 NA 10
## 14595 0 NA 9
## 14629 1 NA 15
## 14707 0 NA 5
## 14709 1 NA 15
## 14715 0 NA 9
## 14747 0 NA 12
## 14765 1 NA 7
## 14801 1 NA 2
## 14802 0 NA 6
## 14826 1 NA 3
## 14846 1 NA 4
## 14862 0 NA 5
## 14874 0 NA 5
## 14958 0 NA 5
## 15210 1 NA 4
## 15211 0 NA 5
## 15305 1 NA 8
## 15347 0 NA 5
## 15397 1 NA 4
## 15517 1 NA 1
## 15525 0 NA 3
## 15539 1 NA 11
## 15540 1 NA 14
## 15622 0 NA 5
## 15624 0 NA 17
## 15682 0 NA 9
## 15722 1 NA 6
## 15744 0 NA 15
## 15766 0 NA 3
# Among the flagged records, set ages above 17 to missing. The comparison is
# computed first and then used to index the same subset.
over_17 <- mydata[notAnon, "a203_age_"] > 17
mydata[notAnon, "a203_age_"][over_17] <- NA
# Rebuild the sdcMicro object on the edited data.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
#mydata <- labelDataset(mydata)
# Re-flag records still violating 2-anonymity and show their key variables.
risk_individual <- sdcInitial@risk$individual
notAnon <- risk_individual[, 2] < 2
mydata[notAnon, selectedKeyVars]
## # A tibble: 1 x 3
## a204_gender_ a203_age_ a207_complete_edu_
## <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 0 [Female] 7 16 [Below primary]
# Re-run local suppression on the rebuilt object and display the key
# variables of the remaining flagged record(s) after suppression.
sdcFinal <- localSuppression(sdcInitial)
manipulated <- extractManipData(sdcFinal)
manipulated[notAnon, selectedKeyVars] # manipulated variables HH
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (cc != class(v_p)) {: the condition has length > 1 and only the first
## element will be used
## a204_gender_ a203_age_ a207_complete_edu_
## 4319 0 7 NA
# Set gender to missing for the record(s) still flagged, then rebuild the
# sdcMicro object only to print the final risk summary (it is not stored).
mydata[notAnon, "a204_gender_"] <- NA
createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
## The input dataset consists of 15774 rows and 34 variables.
## --> Categorical key variables: a204_gender_, a203_age_, a207_complete_edu_
## --> Cluster/Household-Id variable: hh_id
## ----------------------------------------------------------------------
## Information on categorical key variables:
##
## Reported is the number, mean size and size of the smallest category >0 for recoded variables.
## In parenthesis, the same statistics are shown for the unmodified data.
## Note: NA (missings) are counted as seperate categories!
## Key Variable Number of categories Mean size
## a204_gender_ 3 (3) 7886.500 (7886.500)
## a203_age_ 78 (78) 201.195 (201.195)
## a207_complete_edu_ 17 (17) 927.882 (927.882)
## Size of smallest (>0)
## 6581 (6581)
## 2 (2)
## 4 (4)
## ----------------------------------------------------------------------
## Infos on 2/3-Anonymity:
##
## Number of observations violating
## - 2-anonymity: 0 (0.000%)
## - 3-anonymity: 0 (0.000%)
## - 5-anonymity: 9 (0.057%)
##
## ----------------------------------------------------------------------
# !!! No open-ends
# !!! No GPS data
# Adds "_PU" (Public Use) to the end of the name
# Write the processed dataset in Stata and SPSS formats, using the input
# file name with a "_PU" suffix.
public_dta <- paste0(filename, "_PU.dta")
public_sav <- paste0(filename, "_PU.sav")
haven::write_dta(mydata, public_dta)
haven::write_sav(mydata, public_sav)
# Build the report title from the file name.
title_var <- paste0("DOL-ILAB SDC - ", filename)