import pandas as pd
import numpy as np
Process JMP sanitation data and add the treatment data
First read the JMP processed sanitation surveys
df_jmp = pd.read_csv("jmp_sanitation_surveys.csv")
df_jmp
country | alpha.2 | alpha.3 | numeric | context | classific_id | classification | percentage | source | |
---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | AF | AFG | 4 | Urban | 1 | Flush and pour flush - to piped sewer system | 0.000 | AFG_1997_MICS |
1 | Afghanistan | AF | AFG | 4 | Urban | 2 | Flush and pour flush - to septic tank | 0.000 | AFG_1997_MICS |
2 | Afghanistan | AF | AFG | 4 | Urban | 3 | Flush and pour flush - to pit | 0.000 | AFG_1997_MICS |
3 | Afghanistan | AF | AFG | 4 | Urban | 4 | Flush and pour flush - to unknown place/ not s... | 0.000 | AFG_1997_MICS |
4 | Afghanistan | AF | AFG | 4 | Urban | 5 | Flush and pour flush - to elsewhere | 0.000 | AFG_1997_MICS |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
398849 | Zimbabwe | ZW | ZWE | 716 | National | 13 | Composting toilets | 0.007 | ZWE_2022_CEN |
398850 | Zimbabwe | ZW | ZWE | 716 | National | 14 | Other improved | 0.000 | ZWE_2022_CEN |
398851 | Zimbabwe | ZW | ZWE | 716 | National | 15 | No facility, bush, field | 0.198 | ZWE_2022_CEN |
398852 | Zimbabwe | ZW | ZWE | 716 | National | 16 | Other unimproved | 0.003 | ZWE_2022_CEN |
398853 | Zimbabwe | ZW | ZWE | 716 | National | 17 | DK/missing information | 0.000 | ZWE_2022_CEN |
398854 rows × 9 columns
df_jmp = df_jmp.rename(columns={'alpha.3':'iso3'}).drop(columns=["alpha.2","numeric"])
df_jmp['year'] = df_jmp['source'].str[4:8].apply(lambda x: int(x))
df_jmp = df_jmp.set_index(["source","context","classific_id"])
df_jmp.head()
country | iso3 | classification | percentage | year | |||
---|---|---|---|---|---|---|---|
source | context | classific_id | |||||
AFG_1997_MICS | Urban | 1 | Afghanistan | AFG | Flush and pour flush - to piped sewer system | 0.0 | 1997 |
2 | Afghanistan | AFG | Flush and pour flush - to septic tank | 0.0 | 1997 | ||
3 | Afghanistan | AFG | Flush and pour flush - to pit | 0.0 | 1997 | ||
4 | Afghanistan | AFG | Flush and pour flush - to unknown place/ not s... | 0.0 | 1997 | ||
5 | Afghanistan | AFG | Flush and pour flush - to elsewhere | 0.0 | 1997 |
The GloWPa model uses different sanitation categories. In the next steps the JPM sanitation categories are mapped to the GloWPa sanitation categories.
df_harmon = pd.DataFrame({
'classific_id': range(1, 18),
'san': ["flushSewer", "flushSeptic", "flushPit", "flushUnknown", "flushOpen",
"pitSlab", "pitSlab", "pitSlab", "pitNoSlab", "hangingToilet", "bucketLatrine",
"other", "compostingToilet", "pitSlab", "openDefecation", "other", "other"]
}).set_index("classific_id")
df_jmp = df_jmp.join(df_harmon, on = "classific_id", how = "left")
df_jmp.head(10)
country | iso3 | classification | percentage | year | san | |||
---|---|---|---|---|---|---|---|---|
source | context | classific_id | ||||||
AFG_1997_MICS | Urban | 1 | Afghanistan | AFG | Flush and pour flush - to piped sewer system | 0.000 | 1997 | flushSewer |
2 | Afghanistan | AFG | Flush and pour flush - to septic tank | 0.000 | 1997 | flushSeptic | ||
3 | Afghanistan | AFG | Flush and pour flush - to pit | 0.000 | 1997 | flushPit | ||
4 | Afghanistan | AFG | Flush and pour flush - to unknown place/ not s... | 0.000 | 1997 | flushUnknown | ||
5 | Afghanistan | AFG | Flush and pour flush - to elsewhere | 0.000 | 1997 | flushOpen | ||
6 | Afghanistan | AFG | Dry latrines - Ventilated Improved Pit latrine | 0.000 | 1997 | pitSlab | ||
7 | Afghanistan | AFG | Dry latrines - Pit latrine with slab/covered l... | 0.017 | 1997 | pitSlab | ||
8 | Afghanistan | AFG | Dry latrines - Traditional latrine | 0.171 | 1997 | pitSlab | ||
9 | Afghanistan | AFG | Dry latrines - Pit latrine without slab/open pit | 0.522 | 1997 | pitNoSlab | ||
10 | Afghanistan | AFG | Dry latrines - Hanging toilet/hanging latrine | 0.000 | 1997 | hangingToilet |
Now, we need to check if the sanitation fractions from the surveys are complete for each survey (source) and at national, urban and rural level (context).
df_source_sums = df_jmp.groupby(["source","context"])["percentage"].sum()
df_source_sums.sort_values()
source context
KGZ_2008_WW Rural 0.000
LVA_2015_PWH Urban 0.000
LVA_2016_ES Rural 0.000
Urban 0.000
LVA_2016_ESDW National 0.000
...
NGA_2006_GHS Rural 1.344
MEX_1998_ENIGH Rural 1.394
MEX_1996_ENIGH Rural 1.402
EST_2006_HBS Rural 1.429
MEX_1992_ENIGH Rural 1.471
Name: percentage, Length: 23462, dtype: float64
We include all surveys when the total fraction is between 0.95 and 1.05. Later these fractions will be normalized.
complete_threshold_lower = 0.95
complete_threshold_upper = 1.05
df_complete_sources = df_source_sums[(df_source_sums > complete_threshold_lower) & (df_source_sums < complete_threshold_upper)]
df_complete_sources.sort_values()
source context
MDG_2005_EP National 0.950
TZA_1996_DHS Urban 0.950
MRT_2000_EPCV Rural 0.950
MNG_2010_HSES Rural 0.951
RUS_2003_WHS Rural 0.951
...
CHN_2003_NHSS Rural 1.043
MNG_2015_HSES Rural 1.046
DEU_2012_EUSILC National 1.048
MEX_2008_ENIGH Urban 1.048
BRA_2008_PNAD Rural 1.049
Name: percentage, Length: 6896, dtype: float64
df_jmp_complete = df_jmp.reset_index("classific_id").loc[df_complete_sources.index]
df_jmp_complete.head()
classific_id | country | iso3 | classification | percentage | year | san | ||
---|---|---|---|---|---|---|---|---|
source | context | |||||||
ABW_1990_CEN | National | 1 | Aruba | ABW | Flush and pour flush - to piped sewer system | 0.132 | 1990 | flushSewer |
National | 2 | Aruba | ABW | Flush and pour flush - to septic tank | 0.823 | 1990 | flushSeptic | |
National | 3 | Aruba | ABW | Flush and pour flush - to pit | 0.000 | 1990 | flushPit | |
National | 4 | Aruba | ABW | Flush and pour flush - to unknown place/ not s... | 0.000 | 1990 | flushUnknown | |
National | 5 | Aruba | ABW | Flush and pour flush - to elsewhere | 0.000 | 1990 | flushOpen |
Now for each country and context, we need the most recent year of complete data
recent_year_idx = df_jmp_complete.reset_index().groupby(["iso3","context"])["year"].idxmax()
idx_complete = df_jmp_complete.iloc[recent_year_idx].index
idx_complete
MultiIndex([( 'ABW_2020_CEN', 'National'),
('AFG_2020_IELFS', 'National'),
('AFG_2020_IELFS', 'Rural'),
('AFG_2020_IELFS', 'Urban'),
( 'AGO_2016_IIMS', 'National'),
( 'AGO_2016_IIMS', 'Rural'),
( 'AGO_2016_IIMS', 'Urban'),
( 'AIA_2009_CPA', 'National'),
( 'AIA_2009_CPA', 'Urban'),
( 'ALB_2018_DHS', 'National'),
...
( 'YEM_2019_WFP', 'Urban'),
( 'ZAF_2021_GHS', 'National'),
( 'ZAF_2021_GHS', 'Rural'),
( 'ZAF_2021_GHS', 'Urban'),
( 'ZMB_2018_DHS', 'National'),
( 'ZMB_2018_DHS', 'Rural'),
( 'ZMB_2018_DHS', 'Urban'),
( 'ZWE_2019_MICS', 'National'),
( 'ZWE_2019_MICS', 'Rural'),
( 'ZWE_2019_MICS', 'Urban')],
names=['source', 'context'], length=619)
df_jmp_complete_recent = df_jmp_complete.loc[idx_complete]
df_jmp_complete_recent
classific_id | country | iso3 | classification | percentage | year | san | ||
---|---|---|---|---|---|---|---|---|
source | context | |||||||
ABW_2020_CEN | National | 1 | Aruba | ABW | Flush and pour flush - to piped sewer system | 0.000 | 2020 | flushSewer |
National | 2 | Aruba | ABW | Flush and pour flush - to septic tank | 0.000 | 2020 | flushSeptic | |
National | 3 | Aruba | ABW | Flush and pour flush - to pit | 0.000 | 2020 | flushPit | |
National | 4 | Aruba | ABW | Flush and pour flush - to unknown place/ not s... | 0.000 | 2020 | flushUnknown | |
National | 5 | Aruba | ABW | Flush and pour flush - to elsewhere | 0.000 | 2020 | flushOpen | |
... | ... | ... | ... | ... | ... | ... | ... | ... |
ZWE_2019_MICS | Urban | 13 | Zimbabwe | ZWE | Composting toilets | 0.001 | 2019 | compostingToilet |
Urban | 14 | Zimbabwe | ZWE | Other improved | 0.000 | 2019 | pitSlab | |
Urban | 15 | Zimbabwe | ZWE | No facility, bush, field | 0.007 | 2019 | openDefecation | |
Urban | 16 | Zimbabwe | ZWE | Other unimproved | 0.000 | 2019 | other | |
Urban | 17 | Zimbabwe | ZWE | DK/missing information | 0.000 | 2019 | other |
10523 rows × 7 columns
Now normalize the fractions so all fractions will sum up to one
df_jmp_complete_recent = df_jmp_complete_recent.join(df_complete_sources.rename('sum_frac'))
df_jmp_complete_recent['frac'] = df_jmp_complete_recent['percentage'] / df_jmp_complete_recent['sum_frac']
df_jmp_complete_recent.head()
classific_id | country | iso3 | classification | percentage | year | san | sum_frac | frac | ||
---|---|---|---|---|---|---|---|---|---|---|
source | context | |||||||||
ABW_2020_CEN | National | 1 | Aruba | ABW | Flush and pour flush - to piped sewer system | 0.0 | 2020 | flushSewer | 1.0 | 0.0 |
National | 2 | Aruba | ABW | Flush and pour flush - to septic tank | 0.0 | 2020 | flushSeptic | 1.0 | 0.0 | |
National | 3 | Aruba | ABW | Flush and pour flush - to pit | 0.0 | 2020 | flushPit | 1.0 | 0.0 | |
National | 4 | Aruba | ABW | Flush and pour flush - to unknown place/ not s... | 0.0 | 2020 | flushUnknown | 1.0 | 0.0 | |
National | 5 | Aruba | ABW | Flush and pour flush - to elsewhere | 0.0 | 2020 | flushOpen | 1.0 | 0.0 |
df_jmp_recent_agg = df_jmp_complete_recent.reset_index().groupby(["iso3","context","san"])[['frac','source']].agg({'frac':'sum','source':'first'})
df_jmp_recent_agg
frac | source | |||
---|---|---|---|---|
iso3 | context | san | ||
ABW | National | bucketLatrine | 0.000000 | ABW_2020_CEN |
compostingToilet | 0.000000 | ABW_2020_CEN | ||
flushOpen | 0.000000 | ABW_2020_CEN | ||
flushPit | 0.000000 | ABW_2020_CEN | ||
flushSeptic | 0.000000 | ABW_2020_CEN | ||
... | ... | ... | ... | ... |
ZWE | Urban | hangingToilet | 0.000000 | ZWE_2019_MICS |
openDefecation | 0.007007 | ZWE_2019_MICS | ||
other | 0.000000 | ZWE_2019_MICS | ||
pitNoSlab | 0.007007 | ZWE_2019_MICS | ||
pitSlab | 0.035035 | ZWE_2019_MICS |
7428 rows × 2 columns
Create a empty data.frame with all countries (iso3), sanitation types (san) and context. Update the rows with most complete and recent surveys. There will be rows with missing data.
iso3_vals = df_jmp['iso3'].unique()
context = ["Urban","Rural"]
sani_type = df_harmon['san'].unique()
df_jmp_glowpa = pd.DataFrame({"frac": np.nan, "source":None},index = pd.MultiIndex.from_product([iso3_vals,context,sani_type], names=["iso3","context","san"]))
df_jmp_glowpa.update(df_jmp_recent_agg, join="left")
df_jmp_glowpa.head()
frac | source | |||
---|---|---|---|---|
iso3 | context | san | ||
AFG | Urban | flushSewer | 0.100 | AFG_2020_IELFS |
flushSeptic | 0.226 | AFG_2020_IELFS | ||
flushPit | 0.023 | AFG_2020_IELFS | ||
flushUnknown | 0.000 | AFG_2020_IELFS | ||
flushOpen | 0.010 | AFG_2020_IELFS |
Now, manipulate the national data and use these values when urban and rural sanitation fractions are missing.
# slice the most recent and complate national data
df_jmp_recent_agg_nat = df_jmp_recent_agg.xs("National", level = "context")
# manipulate the context, so we can use these indices to update the final data.frame.
df_jmp_recent_agg_nat['context'] = 'Urban'
df_jmp_recent_agg_nat2 = df_jmp_recent_agg_nat.copy()
# do same for rural
df_jmp_recent_agg_nat2['context'] = 'Rural'
df_jmp_recent_agg_nat = pd.concat([df_jmp_recent_agg_nat, df_jmp_recent_agg_nat2]).reset_index().set_index(['iso3','context','san'])
df_jmp_recent_agg_nat
/tmp/ipykernel_45892/252835684.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_jmp_recent_agg_nat['context'] = 'Urban'
frac | source | |||
---|---|---|---|---|
iso3 | context | san | ||
ABW | Urban | bucketLatrine | 0.000000 | ABW_2020_CEN |
compostingToilet | 0.000000 | ABW_2020_CEN | ||
flushOpen | 0.000000 | ABW_2020_CEN | ||
flushPit | 0.000000 | ABW_2020_CEN | ||
flushSeptic | 0.000000 | ABW_2020_CEN | ||
... | ... | ... | ... | ... |
ZWE | Rural | hangingToilet | 0.000000 | ZWE_2019_MICS |
openDefecation | 0.216783 | ZWE_2019_MICS | ||
other | 0.000999 | ZWE_2019_MICS | ||
pitNoSlab | 0.092907 | ZWE_2019_MICS | ||
pitSlab | 0.351648 | ZWE_2019_MICS |
5400 rows × 2 columns
count the number of NA
df_jmp_glowpa.isna().sum()
frac 816
source 816
dtype: int64
join national data when urban and/or rural values are missing
df_jmp_glowpa.update(df_jmp_recent_agg_nat, join="left", overwrite=False)
count the number of NA again
df_jmp_glowpa.isna().sum()
frac 132
source 132
dtype: int64
Check which indices have missing values.
df_jmp_glowpa[df_jmp_glowpa.isna().any(axis = 1)].reset_index('san').index.unique()
MultiIndex([('BWA', 'Rural'),
('COK', 'Urban'),
('COK', 'Rural'),
('GNQ', 'Urban'),
('GNQ', 'Rural'),
('NCL', 'Urban'),
('NCL', 'Rural'),
('SPM', 'Urban'),
('SPM', 'Rural'),
('TKL', 'Urban'),
('TKL', 'Rural')],
names=['iso3', 'context'])
BWA and GNQ could adopt the values from either Rural or Urban. For we drop them. validate the final sums.
df_jmp_glowpa_cleaned = df_jmp_glowpa.dropna()
(df_jmp_glowpa_cleaned.groupby(["iso3","context"])["frac"].sum() > 1.01).sum()
0
(df_jmp_glowpa_cleaned.groupby(["iso3","context"])["frac"].sum() < 0.99).sum()
0
Read the treatment dataset.
df_treatment = pd.read_csv("jmp_treatment.csv", sep = ",")
df_treatment = df_treatment.rename(columns={"ISO3":"iso3","Residence Type":"context"}).set_index(["iso3","context","Service Level"])
df_treatment
Year | Service Type | Coverage | Population | |||
---|---|---|---|---|---|---|
iso3 | context | Service Level | ||||
ABW | total | Sewage treated | 2022 | Sanitation | 4.672102506950807 | 4973.219513523787 |
ALB | total | Faecal sludge treated | 2022 | Sanitation | 10.19487443466163 | 289771.05698001874 |
Disposed insitu | 2022 | Sanitation | 17.21067442717733 | 489182.613485291 | ||
Sewage treated | 2022 | Sanitation | 28.95681524326932 | 823045.640590645 | ||
AND | total | Faecal sludge treated | 2022 | Sanitation | 0 | 0 |
... | ... | ... | ... | ... | ... | ... |
ZAF | urban | Disposed insitu | 2022 | Sanitation | 2.378316688634978 | 973409.0605099642 |
Sewage treated | 2022 | Sanitation | 68.4513130834805 | 28016087.461214747 | ||
ZWE | urban | Faecal sludge treated | 2022 | Sanitation | 0.4039044457537767 | 21354.58153069156 |
Disposed insitu | 2022 | Sanitation | 4.847340805559787 | 256280.75037945205 | ||
Sewage treated | 2022 | Sanitation | 29.4893300372857 | 1559112.0850167093 |
1085 rows × 4 columns
Drop some headers which were read as data rows.
df_treatment = df_treatment.drop([("ISO3","Residence Type","Service Level")])
df_treatment
Year | Service Type | Coverage | Population | |||
---|---|---|---|---|---|---|
iso3 | context | Service Level | ||||
ABW | total | Sewage treated | 2022 | Sanitation | 4.672102506950807 | 4973.219513523787 |
ALB | total | Faecal sludge treated | 2022 | Sanitation | 10.19487443466163 | 289771.05698001874 |
Disposed insitu | 2022 | Sanitation | 17.21067442717733 | 489182.613485291 | ||
Sewage treated | 2022 | Sanitation | 28.95681524326932 | 823045.640590645 | ||
AND | total | Faecal sludge treated | 2022 | Sanitation | 0 | 0 |
... | ... | ... | ... | ... | ... | ... |
ZAF | urban | Disposed insitu | 2022 | Sanitation | 2.378316688634978 | 973409.0605099642 |
Sewage treated | 2022 | Sanitation | 68.4513130834805 | 28016087.461214747 | ||
ZWE | urban | Faecal sludge treated | 2022 | Sanitation | 0.4039044457537767 | 21354.58153069156 |
Disposed insitu | 2022 | Sanitation | 4.847340805559787 | 256280.75037945205 | ||
Sewage treated | 2022 | Sanitation | 29.4893300372857 | 1559112.0850167093 |
1083 rows × 4 columns
Transform percentages into fractions.
df_service = df_treatment.reset_index().pivot(index=["iso3","context"],columns="Service Level", values = "Coverage")
df_service.rename(index={"total":"National","urban":"Urban","rural": "Rural"}, level="context", inplace=True)
df_service.rename(columns={"Disposed insitu":"coverBury","Faecal sludge treated":"fecalSludgeTreated","Sewage treated":"sewageTreated"}, inplace = True)
df_service["coverBury"] = df_service["coverBury"].astype(float) / 100
df_service["fecalSludgeTreated"] = df_service["fecalSludgeTreated"].astype(float) / 100
df_service["sewageTreated"] = df_service["sewageTreated"].astype(float) / 100
df_service_subnat = df_service.loc[(slice(None) ,["Urban","Rural"]),:]
df_service_subnat
Service Level | coverBury | fecalSludgeTreated | sewageTreated | |
---|---|---|---|---|
iso3 | context | |||
ALB | Urban | 0.063935 | 0.037873 | 0.394653 |
AND | Urban | 0.000000 | 0.000000 | 1.000000 |
ARE | Urban | 0.024618 | 0.024618 | 0.935318 |
ARG | Urban | 0.115030 | 0.042154 | 0.304929 |
ARM | Urban | 0.000000 | 0.000000 | 0.000000 |
... | ... | ... | ... | ... |
XKX | Rural | 0.019912 | 0.007235 | 0.006454 |
YEM | Rural | NaN | NaN | 0.003894 |
ZAF | Rural | NaN | NaN | 0.041806 |
ZMB | Rural | 0.303562 | 0.000000 | 0.001464 |
ZWE | Rural | 0.298805 | 0.000000 | 0.005200 |
245 rows × 3 columns
Update missing data for urban and rural areas with national data if present.
df_service_nat = df_service.loc[(slice(None) ,["National"]),:]
df_service_subnat.update(df_service_nat.rename(index={"National":"Rural"}, level="context"), overwrite=False)
df_service_subnat.update(df_service_nat.rename(index={"National":"Urban"}, level="context"), overwrite=False)
df_service_subnat
Service Level | coverBury | fecalSludgeTreated | sewageTreated | |
---|---|---|---|---|
iso3 | context | |||
ALB | Urban | 0.063935 | 0.037873 | 0.394653 |
AND | Urban | 0.000000 | 0.000000 | 1.000000 |
ARE | Urban | 0.024618 | 0.024618 | 0.935318 |
ARG | Urban | 0.115030 | 0.042154 | 0.304929 |
ARM | Urban | 0.000000 | 0.000000 | 0.000000 |
... | ... | ... | ... | ... |
XKX | Rural | 0.019912 | 0.007235 | 0.006454 |
YEM | Rural | 0.076267 | 0.024272 | 0.003894 |
ZAF | Rural | 0.108661 | 0.104868 | 0.041806 |
ZMB | Rural | 0.303562 | 0.000000 | 0.001464 |
ZWE | Rural | 0.298805 | 0.000000 | 0.005200 |
245 rows × 3 columns
Here we add some required constants to the sanitation data.
df_jmp_frac_glowpa_table = df_jmp_glowpa_cleaned.reset_index().pivot(index=['iso3','context'], columns='san', values='frac')
# The GloWPa model expects the containerBased sanitation type, but it is not provided by the JMP surveys.
df_jmp_frac_glowpa_table["containerBased"] = np.nan
df_jmp_frac_glowpa_table["onsiteDumpedland"] = 0.1
df_jmp_frac_glowpa_table["emptyFrequency"] = 3
df_jmp_frac_glowpa_table["pitAdditive"] = 0
df_jmp_frac_glowpa_table["urine"] = 0
df_jmp_frac_glowpa_table["twinPits"] = 0
df_jmp_frac_glowpa_table.head()
san | bucketLatrine | compostingToilet | flushOpen | flushPit | flushSeptic | flushSewer | flushUnknown | hangingToilet | openDefecation | other | pitNoSlab | pitSlab | containerBased | onsiteDumpedland | emptyFrequency | pitAdditive | urine | twinPits | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
iso3 | context | ||||||||||||||||||
ABW | Rural | 0.000000 | 0.0 | 0.000000 | 0.000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.007000 | 0.002000 | 0.000 | 0.991 | NaN | 0.1 | 3 | 0 | 0 | 0 |
Urban | 0.000000 | 0.0 | 0.000000 | 0.000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.007000 | 0.002000 | 0.000 | 0.991 | NaN | 0.1 | 3 | 0 | 0 | 0 | |
AFG | Rural | 0.000000 | 0.0 | 0.003000 | 0.007 | 0.019000 | 0.017000 | 0.0 | 0.0 | 0.149000 | 0.006000 | 0.222 | 0.577 | NaN | 0.1 | 3 | 0 | 0 | 0 |
Urban | 0.000000 | 0.0 | 0.010000 | 0.023 | 0.226000 | 0.100000 | 0.0 | 0.0 | 0.002000 | 0.001000 | 0.152 | 0.486 | NaN | 0.1 | 3 | 0 | 0 | 0 | |
AGO | Rural | 0.001998 | 0.0 | 0.075924 | 0.000 | 0.270729 | 0.022977 | 0.0 | 0.0 | 0.627373 | 0.000999 | 0.000 | 0.000 | NaN | 0.1 | 3 | 0 | 0 | 0 |
# merge treatment
df_jmp_frac_treatment_glowpa_table = df_jmp_frac_glowpa_table.join(df_service_subnat, on=["iso3","context"], how="left")
df_jmp_frac_treatment_glowpa_table
bucketLatrine | compostingToilet | flushOpen | flushPit | flushSeptic | flushSewer | flushUnknown | hangingToilet | openDefecation | other | ... | pitSlab | containerBased | onsiteDumpedland | emptyFrequency | pitAdditive | urine | twinPits | coverBury | fecalSludgeTreated | sewageTreated | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
iso3 | context | |||||||||||||||||||||
ABW | Rural | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.007000 | 0.002000 | ... | 0.991000 | NaN | 0.1 | 3 | 0 | 0 | 0 | NaN | NaN | NaN |
Urban | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.007000 | 0.002000 | ... | 0.991000 | NaN | 0.1 | 3 | 0 | 0 | 0 | NaN | NaN | NaN | |
AFG | Rural | 0.000000 | 0.000000 | 0.003000 | 0.007000 | 0.019000 | 0.017000 | 0.000000 | 0.0 | 0.149000 | 0.006000 | ... | 0.577000 | NaN | 0.1 | 3 | 0 | 0 | 0 | NaN | NaN | NaN |
Urban | 0.000000 | 0.000000 | 0.010000 | 0.023000 | 0.226000 | 0.100000 | 0.000000 | 0.0 | 0.002000 | 0.001000 | ... | 0.486000 | NaN | 0.1 | 3 | 0 | 0 | 0 | NaN | NaN | NaN | |
AGO | Rural | 0.001998 | 0.000000 | 0.075924 | 0.000000 | 0.270729 | 0.022977 | 0.000000 | 0.0 | 0.627373 | 0.000999 | ... | 0.000000 | NaN | 0.1 | 3 | 0 | 0 | 0 | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZAF | Urban | 0.010000 | 0.000000 | 0.000000 | 0.000000 | 0.020000 | 0.853000 | 0.000000 | 0.0 | 0.003000 | 0.002000 | ... | 0.112000 | NaN | 0.1 | 3 | 0 | 0 | 0 | 0.023783 | 0.022953 | 0.684513 |
ZMB | Rural | 0.000000 | 0.000000 | 0.000000 | 0.001001 | 0.014014 | 0.005005 | 0.001001 | 0.0 | 0.155155 | 0.000000 | ... | 0.356356 | NaN | 0.1 | 3 | 0 | 0 | 0 | 0.303562 | 0.000000 | 0.001464 |
Urban | 0.000000 | 0.000000 | 0.007000 | 0.039000 | 0.144000 | 0.172000 | 0.005000 | 0.0 | 0.010000 | 0.001000 | ... | 0.430000 | NaN | 0.1 | 3 | 0 | 0 | 0 | NaN | NaN | NaN | |
ZWE | Rural | 0.000000 | 0.001001 | 0.002002 | 0.002002 | 0.040040 | 0.011011 | 0.000000 | 0.0 | 0.313313 | 0.001001 | ... | 0.497497 | NaN | 0.1 | 3 | 0 | 0 | 0 | 0.298805 | 0.000000 | 0.005200 |
Urban | 0.000000 | 0.001001 | 0.001001 | 0.007007 | 0.095095 | 0.845846 | 0.001001 | 0.0 | 0.007007 | 0.000000 | ... | 0.035035 | NaN | 0.1 | 3 | 0 | 0 | 0 | 0.048473 | 0.004039 | 0.294893 |
451 rows × 21 columns
# manipulate nan
df_jmp_frac_treatment_glowpa_table[["coverBury","fecalSludgeTreated"]] = df_jmp_frac_treatment_glowpa_table[["coverBury","fecalSludgeTreated"]].fillna(value = 0)
df_jmp_frac_treatment_glowpa_table["isWatertight"] = df_jmp_frac_treatment_glowpa_table["fecalSludgeTreated"]
df_jmp_frac_treatment_glowpa_table["hasLeach"] = df_jmp_frac_treatment_glowpa_table["fecalSludgeTreated"]
df_jmp_frac_treatment_glowpa_table
bucketLatrine | compostingToilet | flushOpen | flushPit | flushSeptic | flushSewer | flushUnknown | hangingToilet | openDefecation | other | ... | onsiteDumpedland | emptyFrequency | pitAdditive | urine | twinPits | coverBury | fecalSludgeTreated | sewageTreated | isWatertight | hasLeach | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
iso3 | context | |||||||||||||||||||||
ABW | Rural | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.007000 | 0.002000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
Urban | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.007000 | 0.002000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 | |
AFG | Rural | 0.000000 | 0.000000 | 0.003000 | 0.007000 | 0.019000 | 0.017000 | 0.000000 | 0.0 | 0.149000 | 0.006000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
Urban | 0.000000 | 0.000000 | 0.010000 | 0.023000 | 0.226000 | 0.100000 | 0.000000 | 0.0 | 0.002000 | 0.001000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 | |
AGO | Rural | 0.001998 | 0.000000 | 0.075924 | 0.000000 | 0.270729 | 0.022977 | 0.000000 | 0.0 | 0.627373 | 0.000999 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZAF | Urban | 0.010000 | 0.000000 | 0.000000 | 0.000000 | 0.020000 | 0.853000 | 0.000000 | 0.0 | 0.003000 | 0.002000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.023783 | 0.022953 | 0.684513 | 0.022953 | 0.022953 |
ZMB | Rural | 0.000000 | 0.000000 | 0.000000 | 0.001001 | 0.014014 | 0.005005 | 0.001001 | 0.0 | 0.155155 | 0.000000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.303562 | 0.000000 | 0.001464 | 0.000000 | 0.000000 |
Urban | 0.000000 | 0.000000 | 0.007000 | 0.039000 | 0.144000 | 0.172000 | 0.005000 | 0.0 | 0.010000 | 0.001000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 | |
ZWE | Rural | 0.000000 | 0.001001 | 0.002002 | 0.002002 | 0.040040 | 0.011011 | 0.000000 | 0.0 | 0.313313 | 0.001001 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.298805 | 0.000000 | 0.005200 | 0.000000 | 0.000000 |
Urban | 0.000000 | 0.001001 | 0.001001 | 0.007007 | 0.095095 | 0.845846 | 0.001001 | 0.0 | 0.007007 | 0.000000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.048473 | 0.004039 | 0.294893 | 0.004039 | 0.004039 |
451 rows × 23 columns
df_jmp_frac_glowpa_table_rur = df_jmp_frac_treatment_glowpa_table.xs("Rural", level = "context")
df_jmp_frac_glowpa_table_urb = df_jmp_frac_treatment_glowpa_table.xs("Urban", level = "context")
df_jmp_frac_glowpa_table_urb.head()
bucketLatrine | compostingToilet | flushOpen | flushPit | flushSeptic | flushSewer | flushUnknown | hangingToilet | openDefecation | other | ... | onsiteDumpedland | emptyFrequency | pitAdditive | urine | twinPits | coverBury | fecalSludgeTreated | sewageTreated | isWatertight | hasLeach | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
iso3 | |||||||||||||||||||||
ABW | 0.000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.007 | 0.002000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
AFG | 0.000 | 0.000000 | 0.010000 | 0.023000 | 0.226000 | 0.100000 | 0.000000 | 0.0 | 0.002 | 0.001000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
AGO | 0.004 | 0.000000 | 0.040000 | 0.000000 | 0.756000 | 0.117000 | 0.000000 | 0.0 | 0.078 | 0.005000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
AIA | 0.000 | 0.000000 | 0.000000 | 0.000000 | 0.939940 | 0.012012 | 0.000000 | 0.0 | 0.000 | 0.014014 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
ALB | 0.000 | 0.002002 | 0.009009 | 0.056056 | 0.014014 | 0.911912 | 0.005005 | 0.0 | 0.000 | 0.000000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.063935 | 0.037873 | 0.394653 | 0.037873 | 0.037873 |
5 rows × 23 columns
df_jmp_frac_glowpa_table_all = df_jmp_frac_glowpa_table_urb.merge(df_jmp_frac_glowpa_table_rur, on = "iso3" ,suffixes = ['_urb','_rur'])
df_jmp_frac_glowpa_table_all
bucketLatrine_urb | compostingToilet_urb | flushOpen_urb | flushPit_urb | flushSeptic_urb | flushSewer_urb | flushUnknown_urb | hangingToilet_urb | openDefecation_urb | other_urb | ... | onsiteDumpedland_rur | emptyFrequency_rur | pitAdditive_rur | urine_rur | twinPits_rur | coverBury_rur | fecalSludgeTreated_rur | sewageTreated_rur | isWatertight_rur | hasLeach_rur | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
iso3 | |||||||||||||||||||||
ABW | 0.000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.007000 | 0.002000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
AFG | 0.000 | 0.000000 | 0.010000 | 0.023000 | 0.226000 | 0.100000 | 0.000000 | 0.0 | 0.002000 | 0.001000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
AGO | 0.004 | 0.000000 | 0.040000 | 0.000000 | 0.756000 | 0.117000 | 0.000000 | 0.0 | 0.078000 | 0.005000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
AIA | 0.000 | 0.000000 | 0.000000 | 0.000000 | 0.939940 | 0.012012 | 0.000000 | 0.0 | 0.000000 | 0.014014 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
ALB | 0.000 | 0.002002 | 0.009009 | 0.056056 | 0.014014 | 0.911912 | 0.005005 | 0.0 | 0.000000 | 0.000000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.172107 | 0.101949 | 0.104372 | 0.101949 | 0.101949 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
WSM | 0.000 | 0.000000 | 0.000000 | 0.007014 | 0.896794 | 0.085170 | 0.002004 | 0.0 | 0.000000 | 0.001002 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.442523 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
YEM | 0.000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.016000 | 0.255000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.076267 | 0.024272 | 0.003894 | 0.024272 | 0.024272 |
ZAF | 0.010 | 0.000000 | 0.000000 | 0.000000 | 0.020000 | 0.853000 | 0.000000 | 0.0 | 0.003000 | 0.002000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.108661 | 0.104868 | 0.041806 | 0.104868 | 0.104868 |
ZMB | 0.000 | 0.000000 | 0.007000 | 0.039000 | 0.144000 | 0.172000 | 0.005000 | 0.0 | 0.010000 | 0.001000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.303562 | 0.000000 | 0.001464 | 0.000000 | 0.000000 |
ZWE | 0.000 | 0.001001 | 0.001001 | 0.007007 | 0.095095 | 0.845846 | 0.001001 | 0.0 | 0.007007 | 0.000000 | ... | 0.1 | 3 | 0 | 0 | 0 | 0.298805 | 0.000000 | 0.005200 | 0.000000 | 0.000000 |
225 rows × 46 columns
store the sources used for each iso3 and context
df_jmp_source_glowpa_table = df_jmp_glowpa_cleaned.reset_index().groupby(["iso3","context"])["source"].first().to_frame().reset_index().pivot(index = ["iso3"], columns = ["context"], values = "source")
df_jmp_source_glowpa_table.head()
context | Rural | Urban |
---|---|---|
iso3 | ||
ABW | ABW_2020_CEN | ABW_2020_CEN |
AFG | AFG_2020_IELFS | AFG_2020_IELFS |
AGO | AGO_2016_IIMS | AGO_2016_IIMS |
AIA | AIA_2009_CPA | AIA_2009_CPA |
ALB | ALB_2018_DHS | ALB_2018_DHS |
df_jmp_frac_glowpa_table_all.to_csv("glowpa_jmp_frac_level0_global.csv", float_format='%.3f')
df_jmp_source_glowpa_table.to_csv("glowpa_jmp_source_level0_global.csv")
df_jmp_frac_glowpa_table_all.index
Index(['ABW', 'AFG', 'AGO', 'AIA', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ASM',
...
'VGB', 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE'],
dtype='object', name='iso3', length=225)
In total 225 countries are covered with jmp sanitation data.