import pandas as pd
import numpy as np

Process JMP sanitation data and add the treatment data

First read the JMP processed sanitation surveys

df_jmp = pd.read_csv("jmp_sanitation_surveys.csv")
df_jmp
country alpha.2 alpha.3 numeric context classific_id classification percentage source
0 Afghanistan AF AFG 4 Urban 1 Flush and pour flush - to piped sewer system 0.000 AFG_1997_MICS
1 Afghanistan AF AFG 4 Urban 2 Flush and pour flush - to septic tank 0.000 AFG_1997_MICS
2 Afghanistan AF AFG 4 Urban 3 Flush and pour flush - to pit 0.000 AFG_1997_MICS
3 Afghanistan AF AFG 4 Urban 4 Flush and pour flush - to unknown place/ not s... 0.000 AFG_1997_MICS
4 Afghanistan AF AFG 4 Urban 5 Flush and pour flush - to elsewhere 0.000 AFG_1997_MICS
... ... ... ... ... ... ... ... ... ...
398849 Zimbabwe ZW ZWE 716 National 13 Composting toilets 0.007 ZWE_2022_CEN
398850 Zimbabwe ZW ZWE 716 National 14 Other improved 0.000 ZWE_2022_CEN
398851 Zimbabwe ZW ZWE 716 National 15 No facility, bush, field 0.198 ZWE_2022_CEN
398852 Zimbabwe ZW ZWE 716 National 16 Other unimproved 0.003 ZWE_2022_CEN
398853 Zimbabwe ZW ZWE 716 National 17 DK/missing information 0.000 ZWE_2022_CEN

398854 rows × 9 columns

df_jmp = df_jmp.rename(columns={'alpha.3':'iso3'}).drop(columns=["alpha.2","numeric"])
df_jmp['year'] = df_jmp['source'].str[4:8].apply(lambda x: int(x))
df_jmp = df_jmp.set_index(["source","context","classific_id"])
df_jmp.head()
country iso3 classification percentage year
source context classific_id
AFG_1997_MICS Urban 1 Afghanistan AFG Flush and pour flush - to piped sewer system 0.0 1997
2 Afghanistan AFG Flush and pour flush - to septic tank 0.0 1997
3 Afghanistan AFG Flush and pour flush - to pit 0.0 1997
4 Afghanistan AFG Flush and pour flush - to unknown place/ not s... 0.0 1997
5 Afghanistan AFG Flush and pour flush - to elsewhere 0.0 1997

The GloWPa model uses different sanitation categories. In the next steps the JPM sanitation categories are mapped to the GloWPa sanitation categories.

df_harmon = pd.DataFrame({
    'classific_id': range(1, 18),
    'san': ["flushSewer", "flushSeptic", "flushPit", "flushUnknown", "flushOpen",
            "pitSlab", "pitSlab", "pitSlab", "pitNoSlab", "hangingToilet", "bucketLatrine",
            "other", "compostingToilet", "pitSlab", "openDefecation", "other", "other"]
}).set_index("classific_id")
df_jmp = df_jmp.join(df_harmon, on = "classific_id", how = "left")
df_jmp.head(10)
country iso3 classification percentage year san
source context classific_id
AFG_1997_MICS Urban 1 Afghanistan AFG Flush and pour flush - to piped sewer system 0.000 1997 flushSewer
2 Afghanistan AFG Flush and pour flush - to septic tank 0.000 1997 flushSeptic
3 Afghanistan AFG Flush and pour flush - to pit 0.000 1997 flushPit
4 Afghanistan AFG Flush and pour flush - to unknown place/ not s... 0.000 1997 flushUnknown
5 Afghanistan AFG Flush and pour flush - to elsewhere 0.000 1997 flushOpen
6 Afghanistan AFG Dry latrines - Ventilated Improved Pit latrine 0.000 1997 pitSlab
7 Afghanistan AFG Dry latrines - Pit latrine with slab/covered l... 0.017 1997 pitSlab
8 Afghanistan AFG Dry latrines - Traditional latrine 0.171 1997 pitSlab
9 Afghanistan AFG Dry latrines - Pit latrine without slab/open pit 0.522 1997 pitNoSlab
10 Afghanistan AFG Dry latrines - Hanging toilet/hanging latrine 0.000 1997 hangingToilet

Now, we need to check if the sanitation fractions from the surveys are complete for each survey (source) and at national, urban and rural level (context).

df_source_sums = df_jmp.groupby(["source","context"])["percentage"].sum()
df_source_sums.sort_values()
source          context 
KGZ_2008_WW     Rural       0.000
LVA_2015_PWH    Urban       0.000
LVA_2016_ES     Rural       0.000
                Urban       0.000
LVA_2016_ESDW   National    0.000
                            ...  
NGA_2006_GHS    Rural       1.344
MEX_1998_ENIGH  Rural       1.394
MEX_1996_ENIGH  Rural       1.402
EST_2006_HBS    Rural       1.429
MEX_1992_ENIGH  Rural       1.471
Name: percentage, Length: 23462, dtype: float64

We include all surveys when the total fraction is between 0.95 and 1.05. Later these fractions will be normalized.

complete_threshold_lower  = 0.95
complete_threshold_upper = 1.05
df_complete_sources = df_source_sums[(df_source_sums > complete_threshold_lower) & (df_source_sums < complete_threshold_upper)]
df_complete_sources.sort_values()
source           context 
MDG_2005_EP      National    0.950
TZA_1996_DHS     Urban       0.950
MRT_2000_EPCV    Rural       0.950
MNG_2010_HSES    Rural       0.951
RUS_2003_WHS     Rural       0.951
                             ...  
CHN_2003_NHSS    Rural       1.043
MNG_2015_HSES    Rural       1.046
DEU_2012_EUSILC  National    1.048
MEX_2008_ENIGH   Urban       1.048
BRA_2008_PNAD    Rural       1.049
Name: percentage, Length: 6896, dtype: float64
df_jmp_complete = df_jmp.reset_index("classific_id").loc[df_complete_sources.index]
df_jmp_complete.head()
classific_id country iso3 classification percentage year san
source context
ABW_1990_CEN National 1 Aruba ABW Flush and pour flush - to piped sewer system 0.132 1990 flushSewer
National 2 Aruba ABW Flush and pour flush - to septic tank 0.823 1990 flushSeptic
National 3 Aruba ABW Flush and pour flush - to pit 0.000 1990 flushPit
National 4 Aruba ABW Flush and pour flush - to unknown place/ not s... 0.000 1990 flushUnknown
National 5 Aruba ABW Flush and pour flush - to elsewhere 0.000 1990 flushOpen

Now for each country and context, we need the most recent year of complete data

recent_year_idx = df_jmp_complete.reset_index().groupby(["iso3","context"])["year"].idxmax()
idx_complete = df_jmp_complete.iloc[recent_year_idx].index
idx_complete
MultiIndex([(  'ABW_2020_CEN', 'National'),
            ('AFG_2020_IELFS', 'National'),
            ('AFG_2020_IELFS',    'Rural'),
            ('AFG_2020_IELFS',    'Urban'),
            ( 'AGO_2016_IIMS', 'National'),
            ( 'AGO_2016_IIMS',    'Rural'),
            ( 'AGO_2016_IIMS',    'Urban'),
            (  'AIA_2009_CPA', 'National'),
            (  'AIA_2009_CPA',    'Urban'),
            (  'ALB_2018_DHS', 'National'),
            ...
            (  'YEM_2019_WFP',    'Urban'),
            (  'ZAF_2021_GHS', 'National'),
            (  'ZAF_2021_GHS',    'Rural'),
            (  'ZAF_2021_GHS',    'Urban'),
            (  'ZMB_2018_DHS', 'National'),
            (  'ZMB_2018_DHS',    'Rural'),
            (  'ZMB_2018_DHS',    'Urban'),
            ( 'ZWE_2019_MICS', 'National'),
            ( 'ZWE_2019_MICS',    'Rural'),
            ( 'ZWE_2019_MICS',    'Urban')],
           names=['source', 'context'], length=619)
df_jmp_complete_recent = df_jmp_complete.loc[idx_complete]
df_jmp_complete_recent
classific_id country iso3 classification percentage year san
source context
ABW_2020_CEN National 1 Aruba ABW Flush and pour flush - to piped sewer system 0.000 2020 flushSewer
National 2 Aruba ABW Flush and pour flush - to septic tank 0.000 2020 flushSeptic
National 3 Aruba ABW Flush and pour flush - to pit 0.000 2020 flushPit
National 4 Aruba ABW Flush and pour flush - to unknown place/ not s... 0.000 2020 flushUnknown
National 5 Aruba ABW Flush and pour flush - to elsewhere 0.000 2020 flushOpen
... ... ... ... ... ... ... ... ...
ZWE_2019_MICS Urban 13 Zimbabwe ZWE Composting toilets 0.001 2019 compostingToilet
Urban 14 Zimbabwe ZWE Other improved 0.000 2019 pitSlab
Urban 15 Zimbabwe ZWE No facility, bush, field 0.007 2019 openDefecation
Urban 16 Zimbabwe ZWE Other unimproved 0.000 2019 other
Urban 17 Zimbabwe ZWE DK/missing information 0.000 2019 other

10523 rows × 7 columns

Now normalize the fractions so all fractions will sum up to one

df_jmp_complete_recent = df_jmp_complete_recent.join(df_complete_sources.rename('sum_frac'))
df_jmp_complete_recent['frac'] = df_jmp_complete_recent['percentage'] / df_jmp_complete_recent['sum_frac']
df_jmp_complete_recent.head()
classific_id country iso3 classification percentage year san sum_frac frac
source context
ABW_2020_CEN National 1 Aruba ABW Flush and pour flush - to piped sewer system 0.0 2020 flushSewer 1.0 0.0
National 2 Aruba ABW Flush and pour flush - to septic tank 0.0 2020 flushSeptic 1.0 0.0
National 3 Aruba ABW Flush and pour flush - to pit 0.0 2020 flushPit 1.0 0.0
National 4 Aruba ABW Flush and pour flush - to unknown place/ not s... 0.0 2020 flushUnknown 1.0 0.0
National 5 Aruba ABW Flush and pour flush - to elsewhere 0.0 2020 flushOpen 1.0 0.0
df_jmp_recent_agg = df_jmp_complete_recent.reset_index().groupby(["iso3","context","san"])[['frac','source']].agg({'frac':'sum','source':'first'})
df_jmp_recent_agg
frac source
iso3 context san
ABW National bucketLatrine 0.000000 ABW_2020_CEN
compostingToilet 0.000000 ABW_2020_CEN
flushOpen 0.000000 ABW_2020_CEN
flushPit 0.000000 ABW_2020_CEN
flushSeptic 0.000000 ABW_2020_CEN
... ... ... ... ...
ZWE Urban hangingToilet 0.000000 ZWE_2019_MICS
openDefecation 0.007007 ZWE_2019_MICS
other 0.000000 ZWE_2019_MICS
pitNoSlab 0.007007 ZWE_2019_MICS
pitSlab 0.035035 ZWE_2019_MICS

7428 rows × 2 columns

Create a empty data.frame with all countries (iso3), sanitation types (san) and context. Update the rows with most complete and recent surveys. There will be rows with missing data.

iso3_vals = df_jmp['iso3'].unique()
context = ["Urban","Rural"]
sani_type = df_harmon['san'].unique()
df_jmp_glowpa = pd.DataFrame({"frac": np.nan, "source":None},index = pd.MultiIndex.from_product([iso3_vals,context,sani_type], names=["iso3","context","san"]))
df_jmp_glowpa.update(df_jmp_recent_agg, join="left")
df_jmp_glowpa.head()
frac source
iso3 context san
AFG Urban flushSewer 0.100 AFG_2020_IELFS
flushSeptic 0.226 AFG_2020_IELFS
flushPit 0.023 AFG_2020_IELFS
flushUnknown 0.000 AFG_2020_IELFS
flushOpen 0.010 AFG_2020_IELFS

Now, manipulate the national data and use these values when urban and rural sanitation fractions are missing.

# slice the most recent and complate national data
df_jmp_recent_agg_nat = df_jmp_recent_agg.xs("National", level = "context")
# manipulate the context, so we can use these indices to update the final data.frame.
df_jmp_recent_agg_nat['context'] = 'Urban'
df_jmp_recent_agg_nat2 = df_jmp_recent_agg_nat.copy()
# do same for rural
df_jmp_recent_agg_nat2['context'] = 'Rural'
df_jmp_recent_agg_nat = pd.concat([df_jmp_recent_agg_nat, df_jmp_recent_agg_nat2]).reset_index().set_index(['iso3','context','san'])
df_jmp_recent_agg_nat
/tmp/ipykernel_45892/252835684.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jmp_recent_agg_nat['context'] = 'Urban'
frac source
iso3 context san
ABW Urban bucketLatrine 0.000000 ABW_2020_CEN
compostingToilet 0.000000 ABW_2020_CEN
flushOpen 0.000000 ABW_2020_CEN
flushPit 0.000000 ABW_2020_CEN
flushSeptic 0.000000 ABW_2020_CEN
... ... ... ... ...
ZWE Rural hangingToilet 0.000000 ZWE_2019_MICS
openDefecation 0.216783 ZWE_2019_MICS
other 0.000999 ZWE_2019_MICS
pitNoSlab 0.092907 ZWE_2019_MICS
pitSlab 0.351648 ZWE_2019_MICS

5400 rows × 2 columns

count the number of NA

df_jmp_glowpa.isna().sum()
frac      816
source    816
dtype: int64

join national data when urban and/or rural values are missing

df_jmp_glowpa.update(df_jmp_recent_agg_nat, join="left", overwrite=False)

count the number of NA again

df_jmp_glowpa.isna().sum()
frac      132
source    132
dtype: int64

Check which indices have missing values.

df_jmp_glowpa[df_jmp_glowpa.isna().any(axis = 1)].reset_index('san').index.unique()
MultiIndex([('BWA', 'Rural'),
            ('COK', 'Urban'),
            ('COK', 'Rural'),
            ('GNQ', 'Urban'),
            ('GNQ', 'Rural'),
            ('NCL', 'Urban'),
            ('NCL', 'Rural'),
            ('SPM', 'Urban'),
            ('SPM', 'Rural'),
            ('TKL', 'Urban'),
            ('TKL', 'Rural')],
           names=['iso3', 'context'])

BWA and GNQ could adopt the values from either Rural or Urban. For we drop them. validate the final sums.

df_jmp_glowpa_cleaned = df_jmp_glowpa.dropna()
(df_jmp_glowpa_cleaned.groupby(["iso3","context"])["frac"].sum() > 1.01).sum()
0
(df_jmp_glowpa_cleaned.groupby(["iso3","context"])["frac"].sum() < 0.99).sum()
0

Read the treatment dataset.

df_treatment = pd.read_csv("jmp_treatment.csv", sep = ",")
df_treatment = df_treatment.rename(columns={"ISO3":"iso3","Residence Type":"context"}).set_index(["iso3","context","Service Level"])
df_treatment
Year Service Type Coverage Population
iso3 context Service Level
ABW total Sewage treated 2022 Sanitation 4.672102506950807 4973.219513523787
ALB total Faecal sludge treated 2022 Sanitation 10.19487443466163 289771.05698001874
Disposed insitu 2022 Sanitation 17.21067442717733 489182.613485291
Sewage treated 2022 Sanitation 28.95681524326932 823045.640590645
AND total Faecal sludge treated 2022 Sanitation 0 0
... ... ... ... ... ... ...
ZAF urban Disposed insitu 2022 Sanitation 2.378316688634978 973409.0605099642
Sewage treated 2022 Sanitation 68.4513130834805 28016087.461214747
ZWE urban Faecal sludge treated 2022 Sanitation 0.4039044457537767 21354.58153069156
Disposed insitu 2022 Sanitation 4.847340805559787 256280.75037945205
Sewage treated 2022 Sanitation 29.4893300372857 1559112.0850167093

1085 rows × 4 columns

Drop some headers which were read as data rows.

df_treatment = df_treatment.drop([("ISO3","Residence Type","Service Level")])
df_treatment
Year Service Type Coverage Population
iso3 context Service Level
ABW total Sewage treated 2022 Sanitation 4.672102506950807 4973.219513523787
ALB total Faecal sludge treated 2022 Sanitation 10.19487443466163 289771.05698001874
Disposed insitu 2022 Sanitation 17.21067442717733 489182.613485291
Sewage treated 2022 Sanitation 28.95681524326932 823045.640590645
AND total Faecal sludge treated 2022 Sanitation 0 0
... ... ... ... ... ... ...
ZAF urban Disposed insitu 2022 Sanitation 2.378316688634978 973409.0605099642
Sewage treated 2022 Sanitation 68.4513130834805 28016087.461214747
ZWE urban Faecal sludge treated 2022 Sanitation 0.4039044457537767 21354.58153069156
Disposed insitu 2022 Sanitation 4.847340805559787 256280.75037945205
Sewage treated 2022 Sanitation 29.4893300372857 1559112.0850167093

1083 rows × 4 columns

Transform percentages into fractions.

df_service = df_treatment.reset_index().pivot(index=["iso3","context"],columns="Service Level", values = "Coverage")
df_service.rename(index={"total":"National","urban":"Urban","rural": "Rural"}, level="context", inplace=True)
df_service.rename(columns={"Disposed insitu":"coverBury","Faecal sludge treated":"fecalSludgeTreated","Sewage treated":"sewageTreated"}, inplace = True)
df_service["coverBury"] = df_service["coverBury"].astype(float) / 100
df_service["fecalSludgeTreated"] = df_service["fecalSludgeTreated"].astype(float) / 100
df_service["sewageTreated"] = df_service["sewageTreated"].astype(float) / 100
df_service_subnat = df_service.loc[(slice(None) ,["Urban","Rural"]),:]
df_service_subnat
Service Level coverBury fecalSludgeTreated sewageTreated
iso3 context
ALB Urban 0.063935 0.037873 0.394653
AND Urban 0.000000 0.000000 1.000000
ARE Urban 0.024618 0.024618 0.935318
ARG Urban 0.115030 0.042154 0.304929
ARM Urban 0.000000 0.000000 0.000000
... ... ... ... ...
XKX Rural 0.019912 0.007235 0.006454
YEM Rural NaN NaN 0.003894
ZAF Rural NaN NaN 0.041806
ZMB Rural 0.303562 0.000000 0.001464
ZWE Rural 0.298805 0.000000 0.005200

245 rows × 3 columns

Update missing data for urban and rural areas with national data if present.

df_service_nat = df_service.loc[(slice(None) ,["National"]),:]
df_service_subnat.update(df_service_nat.rename(index={"National":"Rural"}, level="context"), overwrite=False)
df_service_subnat.update(df_service_nat.rename(index={"National":"Urban"}, level="context"), overwrite=False)
df_service_subnat
Service Level coverBury fecalSludgeTreated sewageTreated
iso3 context
ALB Urban 0.063935 0.037873 0.394653
AND Urban 0.000000 0.000000 1.000000
ARE Urban 0.024618 0.024618 0.935318
ARG Urban 0.115030 0.042154 0.304929
ARM Urban 0.000000 0.000000 0.000000
... ... ... ... ...
XKX Rural 0.019912 0.007235 0.006454
YEM Rural 0.076267 0.024272 0.003894
ZAF Rural 0.108661 0.104868 0.041806
ZMB Rural 0.303562 0.000000 0.001464
ZWE Rural 0.298805 0.000000 0.005200

245 rows × 3 columns

Here we add some required constants to the sanitation data.

df_jmp_frac_glowpa_table = df_jmp_glowpa_cleaned.reset_index().pivot(index=['iso3','context'], columns='san', values='frac')
# The GloWPa model expects the containerBased sanitation type, but it is not provided by the JMP surveys.
df_jmp_frac_glowpa_table["containerBased"] = np.nan
df_jmp_frac_glowpa_table["onsiteDumpedland"] = 0.1
df_jmp_frac_glowpa_table["emptyFrequency"] = 3
df_jmp_frac_glowpa_table["pitAdditive"] = 0
df_jmp_frac_glowpa_table["urine"] = 0
df_jmp_frac_glowpa_table["twinPits"] = 0
df_jmp_frac_glowpa_table.head()
san bucketLatrine compostingToilet flushOpen flushPit flushSeptic flushSewer flushUnknown hangingToilet openDefecation other pitNoSlab pitSlab containerBased onsiteDumpedland emptyFrequency pitAdditive urine twinPits
iso3 context
ABW Rural 0.000000 0.0 0.000000 0.000 0.000000 0.000000 0.0 0.0 0.007000 0.002000 0.000 0.991 NaN 0.1 3 0 0 0
Urban 0.000000 0.0 0.000000 0.000 0.000000 0.000000 0.0 0.0 0.007000 0.002000 0.000 0.991 NaN 0.1 3 0 0 0
AFG Rural 0.000000 0.0 0.003000 0.007 0.019000 0.017000 0.0 0.0 0.149000 0.006000 0.222 0.577 NaN 0.1 3 0 0 0
Urban 0.000000 0.0 0.010000 0.023 0.226000 0.100000 0.0 0.0 0.002000 0.001000 0.152 0.486 NaN 0.1 3 0 0 0
AGO Rural 0.001998 0.0 0.075924 0.000 0.270729 0.022977 0.0 0.0 0.627373 0.000999 0.000 0.000 NaN 0.1 3 0 0 0
# merge treatment
df_jmp_frac_treatment_glowpa_table = df_jmp_frac_glowpa_table.join(df_service_subnat, on=["iso3","context"], how="left")
df_jmp_frac_treatment_glowpa_table
bucketLatrine compostingToilet flushOpen flushPit flushSeptic flushSewer flushUnknown hangingToilet openDefecation other ... pitSlab containerBased onsiteDumpedland emptyFrequency pitAdditive urine twinPits coverBury fecalSludgeTreated sewageTreated
iso3 context
ABW Rural 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.007000 0.002000 ... 0.991000 NaN 0.1 3 0 0 0 NaN NaN NaN
Urban 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.007000 0.002000 ... 0.991000 NaN 0.1 3 0 0 0 NaN NaN NaN
AFG Rural 0.000000 0.000000 0.003000 0.007000 0.019000 0.017000 0.000000 0.0 0.149000 0.006000 ... 0.577000 NaN 0.1 3 0 0 0 NaN NaN NaN
Urban 0.000000 0.000000 0.010000 0.023000 0.226000 0.100000 0.000000 0.0 0.002000 0.001000 ... 0.486000 NaN 0.1 3 0 0 0 NaN NaN NaN
AGO Rural 0.001998 0.000000 0.075924 0.000000 0.270729 0.022977 0.000000 0.0 0.627373 0.000999 ... 0.000000 NaN 0.1 3 0 0 0 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZAF Urban 0.010000 0.000000 0.000000 0.000000 0.020000 0.853000 0.000000 0.0 0.003000 0.002000 ... 0.112000 NaN 0.1 3 0 0 0 0.023783 0.022953 0.684513
ZMB Rural 0.000000 0.000000 0.000000 0.001001 0.014014 0.005005 0.001001 0.0 0.155155 0.000000 ... 0.356356 NaN 0.1 3 0 0 0 0.303562 0.000000 0.001464
Urban 0.000000 0.000000 0.007000 0.039000 0.144000 0.172000 0.005000 0.0 0.010000 0.001000 ... 0.430000 NaN 0.1 3 0 0 0 NaN NaN NaN
ZWE Rural 0.000000 0.001001 0.002002 0.002002 0.040040 0.011011 0.000000 0.0 0.313313 0.001001 ... 0.497497 NaN 0.1 3 0 0 0 0.298805 0.000000 0.005200
Urban 0.000000 0.001001 0.001001 0.007007 0.095095 0.845846 0.001001 0.0 0.007007 0.000000 ... 0.035035 NaN 0.1 3 0 0 0 0.048473 0.004039 0.294893

451 rows × 21 columns

# manipulate nan
df_jmp_frac_treatment_glowpa_table[["coverBury","fecalSludgeTreated"]] = df_jmp_frac_treatment_glowpa_table[["coverBury","fecalSludgeTreated"]].fillna(value = 0)
df_jmp_frac_treatment_glowpa_table["isWatertight"] = df_jmp_frac_treatment_glowpa_table["fecalSludgeTreated"]
df_jmp_frac_treatment_glowpa_table["hasLeach"] = df_jmp_frac_treatment_glowpa_table["fecalSludgeTreated"]
df_jmp_frac_treatment_glowpa_table

bucketLatrine compostingToilet flushOpen flushPit flushSeptic flushSewer flushUnknown hangingToilet openDefecation other ... onsiteDumpedland emptyFrequency pitAdditive urine twinPits coverBury fecalSludgeTreated sewageTreated isWatertight hasLeach
iso3 context
ABW Rural 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.007000 0.002000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
Urban 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.007000 0.002000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
AFG Rural 0.000000 0.000000 0.003000 0.007000 0.019000 0.017000 0.000000 0.0 0.149000 0.006000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
Urban 0.000000 0.000000 0.010000 0.023000 0.226000 0.100000 0.000000 0.0 0.002000 0.001000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
AGO Rural 0.001998 0.000000 0.075924 0.000000 0.270729 0.022977 0.000000 0.0 0.627373 0.000999 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZAF Urban 0.010000 0.000000 0.000000 0.000000 0.020000 0.853000 0.000000 0.0 0.003000 0.002000 ... 0.1 3 0 0 0 0.023783 0.022953 0.684513 0.022953 0.022953
ZMB Rural 0.000000 0.000000 0.000000 0.001001 0.014014 0.005005 0.001001 0.0 0.155155 0.000000 ... 0.1 3 0 0 0 0.303562 0.000000 0.001464 0.000000 0.000000
Urban 0.000000 0.000000 0.007000 0.039000 0.144000 0.172000 0.005000 0.0 0.010000 0.001000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
ZWE Rural 0.000000 0.001001 0.002002 0.002002 0.040040 0.011011 0.000000 0.0 0.313313 0.001001 ... 0.1 3 0 0 0 0.298805 0.000000 0.005200 0.000000 0.000000
Urban 0.000000 0.001001 0.001001 0.007007 0.095095 0.845846 0.001001 0.0 0.007007 0.000000 ... 0.1 3 0 0 0 0.048473 0.004039 0.294893 0.004039 0.004039

451 rows × 23 columns

df_jmp_frac_glowpa_table_rur = df_jmp_frac_treatment_glowpa_table.xs("Rural", level = "context")
df_jmp_frac_glowpa_table_urb = df_jmp_frac_treatment_glowpa_table.xs("Urban", level = "context")
df_jmp_frac_glowpa_table_urb.head()
bucketLatrine compostingToilet flushOpen flushPit flushSeptic flushSewer flushUnknown hangingToilet openDefecation other ... onsiteDumpedland emptyFrequency pitAdditive urine twinPits coverBury fecalSludgeTreated sewageTreated isWatertight hasLeach
iso3
ABW 0.000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.007 0.002000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
AFG 0.000 0.000000 0.010000 0.023000 0.226000 0.100000 0.000000 0.0 0.002 0.001000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
AGO 0.004 0.000000 0.040000 0.000000 0.756000 0.117000 0.000000 0.0 0.078 0.005000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
AIA 0.000 0.000000 0.000000 0.000000 0.939940 0.012012 0.000000 0.0 0.000 0.014014 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
ALB 0.000 0.002002 0.009009 0.056056 0.014014 0.911912 0.005005 0.0 0.000 0.000000 ... 0.1 3 0 0 0 0.063935 0.037873 0.394653 0.037873 0.037873

5 rows × 23 columns

df_jmp_frac_glowpa_table_all = df_jmp_frac_glowpa_table_urb.merge(df_jmp_frac_glowpa_table_rur, on = "iso3" ,suffixes = ['_urb','_rur'])
df_jmp_frac_glowpa_table_all
bucketLatrine_urb compostingToilet_urb flushOpen_urb flushPit_urb flushSeptic_urb flushSewer_urb flushUnknown_urb hangingToilet_urb openDefecation_urb other_urb ... onsiteDumpedland_rur emptyFrequency_rur pitAdditive_rur urine_rur twinPits_rur coverBury_rur fecalSludgeTreated_rur sewageTreated_rur isWatertight_rur hasLeach_rur
iso3
ABW 0.000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.007000 0.002000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
AFG 0.000 0.000000 0.010000 0.023000 0.226000 0.100000 0.000000 0.0 0.002000 0.001000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
AGO 0.004 0.000000 0.040000 0.000000 0.756000 0.117000 0.000000 0.0 0.078000 0.005000 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
AIA 0.000 0.000000 0.000000 0.000000 0.939940 0.012012 0.000000 0.0 0.000000 0.014014 ... 0.1 3 0 0 0 0.000000 0.000000 NaN 0.000000 0.000000
ALB 0.000 0.002002 0.009009 0.056056 0.014014 0.911912 0.005005 0.0 0.000000 0.000000 ... 0.1 3 0 0 0 0.172107 0.101949 0.104372 0.101949 0.101949
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
WSM 0.000 0.000000 0.000000 0.007014 0.896794 0.085170 0.002004 0.0 0.000000 0.001002 ... 0.1 3 0 0 0 0.442523 0.000000 0.000000 0.000000 0.000000
YEM 0.000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.016000 0.255000 ... 0.1 3 0 0 0 0.076267 0.024272 0.003894 0.024272 0.024272
ZAF 0.010 0.000000 0.000000 0.000000 0.020000 0.853000 0.000000 0.0 0.003000 0.002000 ... 0.1 3 0 0 0 0.108661 0.104868 0.041806 0.104868 0.104868
ZMB 0.000 0.000000 0.007000 0.039000 0.144000 0.172000 0.005000 0.0 0.010000 0.001000 ... 0.1 3 0 0 0 0.303562 0.000000 0.001464 0.000000 0.000000
ZWE 0.000 0.001001 0.001001 0.007007 0.095095 0.845846 0.001001 0.0 0.007007 0.000000 ... 0.1 3 0 0 0 0.298805 0.000000 0.005200 0.000000 0.000000

225 rows × 46 columns

store the sources used for each iso3 and context

df_jmp_source_glowpa_table = df_jmp_glowpa_cleaned.reset_index().groupby(["iso3","context"])["source"].first().to_frame().reset_index().pivot(index = ["iso3"], columns = ["context"], values = "source")
df_jmp_source_glowpa_table.head()
context Rural Urban
iso3
ABW ABW_2020_CEN ABW_2020_CEN
AFG AFG_2020_IELFS AFG_2020_IELFS
AGO AGO_2016_IIMS AGO_2016_IIMS
AIA AIA_2009_CPA AIA_2009_CPA
ALB ALB_2018_DHS ALB_2018_DHS
df_jmp_frac_glowpa_table_all.to_csv("glowpa_jmp_frac_level0_global.csv", float_format='%.3f')
df_jmp_source_glowpa_table.to_csv("glowpa_jmp_source_level0_global.csv")
df_jmp_frac_glowpa_table_all.index
Index(['ABW', 'AFG', 'AGO', 'AIA', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ASM',
       ...
       'VGB', 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE'],
      dtype='object', name='iso3', length=225)

In total 225 countries are covered with jmp sanitation data.