
Commit 50b6f3a

This completes the first smoke-test of the ITR_UI tool working with uncertainties.
We do not yet actually plot error bars... that's next. But we can do all the calculations the GUI requires, which took some work.

Signed-off-by: MichaelTiemann <[email protected]>
1 parent 9e64c82 commit 50b6f3a

File tree: 9 files changed, +259 −128 lines changed

ITR/data/__init__.py

Lines changed: 5 additions & 0 deletions

@@ -7,6 +7,9 @@
 from openscm_units import unit_registry
 import re
 
+import numpy as np
+from uncertainties import ufloat
+
 # openscm_units doesn't make it easy to set preprocessors. This is one way to do it.
 unit_registry.preprocessors=[
     lambda s1: re.sub(r'passenger.km', 'pkm', s1),
@@ -32,3 +35,5 @@
 # FIXME: delay loading of pint_pandas until after we've initialized ourselves
 from pint_pandas import PintType
 PintType.ureg = ureg
+
+_ufloat_nan = ufloat(np.nan, 0.0)
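The new module-level _ufloat_nan gives the package a canonical NaN that carries a (zero) uncertainty. A minimal sketch, not part of the commit, of why that is useful: unp.isnan from the uncertainties package accepts plain floats and UFloats alike, so downstream code can test magnitudes without caring which kind it holds:

    import numpy as np
    from uncertainties import ufloat
    from uncertainties import unumpy as unp

    # The commit's canonical "missing value with zero uncertainty"
    _ufloat_nan = ufloat(np.nan, 0.0)

    print(unp.isnan(_ufloat_nan))        # True: the nominal value is NaN
    print(unp.isnan(ufloat(3.0, 0.1)))   # False: a real measurement
    print(unp.isnan(np.nan))             # True: plain floats work too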

ITR/data/base_providers.py

Lines changed: 118 additions & 74 deletions
Large diffs are not rendered by default.

ITR/data/data_warehouse.py

Lines changed: 29 additions & 5 deletions

@@ -13,6 +13,8 @@
 from ITR.data.data_providers import CompanyDataProvider, ProductionBenchmarkDataProvider, IntensityBenchmarkDataProvider
 from ITR.configs import ColumnsConfig, TemperatureScoreConfig, LoggingConfig
 
+import pint
+
 logger = logging.getLogger(__name__)
 LoggingConfig.add_config_to_logger(logger)
 
@@ -44,10 +46,10 @@ def __init__(self, company_data: CompanyDataProvider,
         # After projections have been made, shift S3 data into S1S2. If we shift before we project,
         # then S3 targets will not be projected correctly.
         for c in self.company_data._companies:
-            if c.ghg_s3:
+            if c.ghg_s3 and not unp.isnan(c.ghg_s3.m):
                 # For Production-centric and energy-only data (except for Cement), convert all S3 numbers to S1 numbers
                 c.ghg_s1s2 = c.ghg_s1s2 + c.ghg_s3
-                c.ghg_s3 = Q_(0, c.ghg_s3.u)
+                c.ghg_s3 = Q_(0.0, c.ghg_s3.u)
             if c.historic_data:
                 if c.historic_data.emissions and c.historic_data.emissions.S3:
                     c.historic_data.emissions.S1S2 = list( map(IEmissionRealization.add, c.historic_data.emissions.S1S2, c.historic_data.emissions.S3) )
@@ -82,12 +84,26 @@ def get_preprocessed_company_data(self, company_ids: List[str]) -> List[ICompany
                                                  company_info_at_base_year).sort_index()
 
         # trajectories are projected from historic data and we are careful to fill all gaps between historic and projections
+        # FIXME: we just computed ALL company data above into a dataframe. Why not use that?
         projected_trajectories = self.company_data.get_company_projected_trajectories(company_ids)
         df_trajectory = self._get_cumulative_emissions(
             projected_ei=projected_trajectories,
             projected_production=projected_production).rename(self.column_config.CUMULATIVE_TRAJECTORY)
 
+        def fix_ragged_projected_targets(x):
+            year = x.index[0]
+            x_val = x[year]
+            if unp.isnan(x_val.m):
+                historic_ei_dict = { d['year']:d['value'] for d in df_company_data.loc[x.name].historic_data['emissions_intensities']['S1S2']}
+                return historic_ei_dict[year]
+            else:
+                return x_val
+
         projected_targets = self.company_data.get_company_projected_targets(company_ids)
+        # Fill in ragged left edge of projected_targets with historic data, interpolating where we need to
+        projected_targets[projected_targets.columns[0]] = (
+            projected_targets[[projected_targets.columns[0]]].apply(fix_ragged_projected_targets, axis=1)
+        )
         df_target = self._get_cumulative_emissions(
             projected_ei=projected_targets,
             projected_production=projected_production).rename(self.column_config.CUMULATIVE_TARGET)
@@ -134,7 +150,15 @@ def _get_cumulative_emissions(self, projected_ei: pd.DataFrame, projected_produc
         :return: cumulative emissions based on weighted sum of emissions intensity * production
         """
         projected_emissions = projected_ei.multiply(projected_production)
-        projected_emissions = projected_emissions.applymap(lambda x: np.nan if unp.isnan(x) else x)
-        null_idx = projected_emissions.index[projected_emissions.isnull().all(axis=1)]
-        return pd.concat([projected_emissions.loc[null_idx, projected_emissions.columns[0]],
+        return projected_emissions.sum(axis=1).astype('pint[Mt CO2]')
+
+        # The following code is broken, due to the way unp.isnan strips away Quantity from scalars
+        # It was written to rescue data from automotive, but maybe not needed anymore?
+        nan_emissions = projected_emissions.applymap(lambda x: np.nan if unp.isnan(x) else x)
+        if nan_emissions.isnull().any(axis=0).any():
+            breakpoint()
+        null_idx = nan_emissions.index[nan_emissions.isnull().all(axis=1)]
+        # FIXME: this replaces the quantified NaNs in projected_emissions with straight-up NaNs,
+        # while also converting the remaining emissions to a consistent unit of 'Mt CO2'
+        return pd.concat([nan_emissions.loc[null_idx, nan_emissions.columns[0]],
                           projected_emissions.loc[projected_emissions.index.difference(null_idx)].sum(axis=1)]).astype('pint[Mt CO2]')
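To see what fix_ragged_projected_targets does, here is a toy, standalone sketch: plain floats stand in for pint quantities, and a hypothetical historic_ei dict stands in for df_company_data. Only the first projection year is inspected; a NaN there is replaced by that company's historic value:

    import numpy as np
    import pandas as pd

    # Toy stand-in: rows are companies, columns are projection years
    projected_targets = pd.DataFrame({2020: [np.nan, 1.0], 2021: [0.9, 0.9]},
                                     index=["company_a", "company_b"])

    # Hypothetical historic emissions intensities, keyed by company and year
    historic_ei = {"company_a": {2020: 1.2}, "company_b": {2020: 1.1}}

    def fix_ragged_projected_targets(x):
        year = x.index[0]          # the first (leftmost) projection year
        x_val = x[year]
        if np.isnan(x_val):        # unp.isnan in the real code
            return historic_ei[x.name][year]
        return x_val

    first = projected_targets.columns[0]
    projected_targets[first] = projected_targets[[first]].apply(
        fix_ragged_projected_targets, axis=1)
    print(projected_targets)       # company_a's 2020 cell is now 1.2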

ITR/data/template.py

Lines changed: 28 additions & 12 deletions

@@ -73,9 +73,16 @@ def _estimated_value(y: pd.Series) -> pint.Quantity:
                 breakpoint()
                 raise ValueError
             # This relies on the fact that we can now see Quantity(np.nan, ...) for both float and ufloat magnitudes
-            x = y[~y.map(unp.isnan)]
+            x = PA_._from_sequence(y)
+            xq = x.quantity
+            xm = xq.m
+            x = y[~unp.isnan(PA_._from_sequence(y).quantity.m)]
         except TypeError:
             logger.error(f"type_error({y}) returning {y.values}[0]")
+            breakpoint()
+            x = PA_._from_sequence(y)
+            xq = x.quantity
+            xm = xq.m
             return y.iloc[0]
         if len(x) == 0:
             # If all inputs are NaN, return the first NaN
@@ -84,17 +91,11 @@ def _estimated_value(y: pd.Series) -> pint.Quantity:
             # If there's only one non-NaN input, return that one
             return x.iloc[0]
         if isinstance(x.values[0], pint.Quantity):
+            from ITR.utils import umean
             values = x.values
             units = values[0].u
             assert all([v.u==units for v in values])
-            values = np.array(list(map(lambda v: v.m if isinstance(v.m, UFloat) else ufloat(v.m, 0), values)))
-            epsilon = unp.nominal_values(values).mean()/(2e13)
-            wavg = ufloat(sum([v.n/(v.s**2+epsilon) for v in values])/sum([1/(v.s**2+epsilon) for v in values]),
-                          np.sqrt(len(values)/sum([1/(v.s**2+epsilon) for v in values])))
-            if wavg.s <= np.sqrt(2*epsilon):
-                if wavg.s > epsilon:
-                    logger.warning(f"Casting out small uncertainty {wavg.s} from {wavg}; epsilon = {epsilon}.")
-                wavg = wavg.n
+            wavg = umean(values)
             est = Q_(wavg, units)
         else:
             logger.error(f"non-qty: _estimated_values called on non-Quantity {x.values[0]};;;")
@@ -284,12 +285,12 @@ def _fixup_name(x):
     df3 = df2.reset_index().set_index(['company_id', 'variable', 'scope'])
     df3 = pd.concat([df3.xs(VariablesConfig.PRODUCTIONS, level=1, drop_level=False)
                      .apply(
-                         lambda x: x.map(lambda y: Q_(y if y is not pd.NA else np.nan,
+                         lambda x: x.map(lambda y: Q_(float(y) if y is not pd.NA else np.nan,
                                                       df_fundamentals.loc[df_fundamentals.company_id == x.name[0],
                                                                           'production_metric'].squeeze())), axis=1),
                      df3.xs(VariablesConfig.EMISSIONS, level=1, drop_level=False)
                      .apply(lambda x: x.map(
-                         lambda y: Q_(y if y is not pd.NA else np.nan,
+                         lambda y: Q_(float(y) if y is not pd.NA else np.nan,
                                       df_fundamentals.loc[df_fundamentals.company_id == x.name[0],
                                                           'emissions_metric'].squeeze())), axis=1)])
     df4 = df3.xs(VariablesConfig.EMISSIONS, level=1) / df3.xs((VariablesConfig.PRODUCTIONS, 'production'),
@@ -306,7 +307,7 @@ def _fixup_name(x):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         for col in esg_year_columns:
-            qty_col = df_esg[esg_has_units].apply(lambda x: Q_(np.nan if pd.isna(x[col]) else x[col], x['unit']), axis=1)
+            qty_col = df_esg[esg_has_units].apply(lambda x: Q_(np.nan if pd.isna(x[col]) else float(x[col]), x['unit']), axis=1)
             df_esg[col] = df_esg[col].astype('object')
             df_esg.loc[df_esg[esg_has_units].index, col] = qty_col
     # FIXME: ...but we have to spill our units back to the corp table for now
@@ -472,6 +473,21 @@ def _company_df_to_model(self, df_fundamentals: pd.DataFrame,
         # TODO pull ghg_s1s2 and ghg_s3 from historic data as appropriate
 
         if df_historic_data is not None:
+            # FIXME: Is this the best place to finalize base_year_production, ghg_s1s2, and ghg_s3 data?
+            # Something tells me these parameters should be removed in favor of querying historical data directly
+            if not ColumnsConfig.BASE_YEAR_PRODUCTION in company_data:
+                company_data[ColumnsConfig.BASE_YEAR_PRODUCTION] = df_historic_data.loc[
+                    company_data[ColumnsConfig.COMPANY_ID], 'Productions', 'production'][
+                        TemperatureScoreConfig.CONTROLS_CONFIG.base_year]
+            if not ColumnsConfig.GHG_SCOPE12 in company_data:
+                company_data[ColumnsConfig.GHG_SCOPE12] = df_historic_data.loc[
+                    company_data[ColumnsConfig.COMPANY_ID], 'Emissions', 'S1S2'][
+                        TemperatureScoreConfig.CONTROLS_CONFIG.base_year]
+            if not ColumnsConfig.GHG_SCOPE3 in company_data:
+                company_data[ColumnsConfig.GHG_SCOPE3] = df_historic_data.loc[
+                    company_data[ColumnsConfig.COMPANY_ID], 'Emissions', 'S3'][
+                        TemperatureScoreConfig.CONTROLS_CONFIG.base_year]
+
             company_data[ColumnsConfig.HISTORIC_DATA] = self._convert_historic_data(
                 df_historic_data.loc[[company_data[ColumnsConfig.COMPANY_ID]]].reset_index()).dict()
         else:
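The repeated float(y) and float(x[col]) coercions all serve one purpose: spreadsheet cells can arrive as ints, numeric strings, or pd.NA, and quantifying them behaves uniformly only once every magnitude is a float. A small sketch of the pattern, assuming nothing beyond stock pint ("t" is pint's built-in tonne):

    import numpy as np
    import pandas as pd
    import pint

    ureg = pint.UnitRegistry()
    Q_ = ureg.Quantity

    cells = pd.Series([3, pd.NA, "4"], dtype="object")  # hypothetical raw ESG cells
    qty = cells.map(lambda y: Q_(np.nan if pd.isna(y) else float(y), "t"))
    print(list(qty))  # magnitudes are now uniformly float (with NaN for pd.NA)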

ITR/interfaces.py

Lines changed: 15 additions & 10 deletions

@@ -202,13 +202,12 @@ def __get_validators__(cls):
     @classmethod
     def validate(cls, value):
         if isinstance(value, str):
-            v, u = value.split(' ', 1)
             try:
-                q = Q_(float(v), u)
+                q = Q_(value)
             except ValueError:
                 raise ValueError(f"cannot convert '{quantity}' to quantity")
             quantity = q
-        elif isinstance(value, Quantity):
+        elif isinstance(value, pint.Quantity):
             quantity = value
         else:
             raise TypeError (f"quantity takes either a Q_ value or a string fully expressing a quantified value; got {value}")
@@ -652,7 +651,8 @@ class ICompanyEIProjection(BaseModel):
 
     def add(self, o):
         assert self.year==o.year
-        return IEIRealization(year=self.year, value = self.value + o.value)
+        return IEIRealization(year=self.year,
+                              value = self.value + (0 if unp.isnan(o.value.m) else o.value))
 
 
 class ICompanyEIProjections(BaseModel):
@@ -690,7 +690,8 @@ class IEmissionRealization(BaseModel):
 
     def add(self, o):
         assert self.year==o.year
-        return IEmissionRealization(year=self.year, value = self.value + o.value)
+        return IEmissionRealization(year=self.year,
+                                    value = self.value + (0 if unp.isnan(o.value.m) else o.value))
 
 
 class IHistoricEmissionsScopes(BaseModel):
@@ -707,7 +708,8 @@ class IEIRealization(BaseModel):
 
     def add(self, o):
         assert self.year==o.year
-        return IEIRealization(year=self.year, value = self.value + o.value)
+        return IEIRealization(year=self.year,
+                              value = self.value + (0 if unp.isnan(o.value.m) else o.value))
 
 
 class IHistoricEIScopes(BaseModel):
@@ -812,7 +814,7 @@ def _sector_to_production_units(self, sector, region="Global"):
         return units
 
     def _get_base_realization_from_historic(self, realized_values: List[BaseModel], units, base_year=None):
-        valid_realizations = [rv for rv in realized_values if rv.value is not None and not unp.isnan(rv.value)]
+        valid_realizations = [rv for rv in realized_values if rv.value is not None and not unp.isnan(rv.value.magnitude)]
         if not valid_realizations:
             retval = realized_values[0].copy()
             retval.year = None
@@ -821,6 +823,7 @@ def _get_base_realization_from_historic(self, realized_values: List[BaseModel],
         if base_year and valid_realizations[0].year != base_year:
             retval = realized_values[0].copy()
             retval.year = base_year
+            # FIXME: Unless and until we accept uncertainties as input, rather than computed data, we don't need to make this a UFloat here
             retval.value = Q_(np.nan, units)
             return retval
         return valid_realizations[0]
@@ -847,14 +850,16 @@ def __init__(self, emissions_metric=None, production_metric=None, base_year_prod
             self.emissions_metric = EmissionsMetric('t CO2')
             # TODO: Should raise a warning here
         base_year = None
+        if self.base_year_production:
+            pass
         # Right now historic_data comes in via template.py ESG data
-        if self.historic_data and self.historic_data.productions:
+        elif self.historic_data and self.historic_data.productions:
             # TODO: This is a hack to get things going.
             base_realization = self._get_base_realization_from_historic(self.historic_data.productions, str(self.production_metric), base_year)
             base_year = base_realization.year
             self.base_year_production = base_realization.value
         else:
-            # raise ValueError(f"missing historic data for base_year_production for {self.company_name}")
+            raise ValueError(f"missing historic data for base_year_production for {self.company_name}")
             self.base_year_production = Q_(np.nan, str(self.production_metric))
         if self.ghg_s1s2 is None and self.historic_data and self.historic_data.emissions:
             if self.historic_data.emissions.S1S2:
@@ -902,10 +907,10 @@ class ICompanyAggregates(ICompanyData):
     benchmark_temperature: quantity('delta_degC')
     benchmark_global_budget: EmissionsQuantity
 
+    # projected_production is computed but never saved, so computed at least 2x: initialization/projection and cumulative budget
     # projected_targets: Optional[ICompanyEIProjectionsScopes]
     # projected_intensities: Optional[ICompanyEIProjectionsScopes]
 
-
 class TemperatureScoreControls(BaseModel):
     base_year: int
     target_end_year: int
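The three add methods now share a NaN guard so that one missing realization contributes nothing rather than turning the whole sum into NaN. Note the parentheses: a bare a + 0 if cond else b parses as (a + 0) if cond else b, which would silently drop a. A toy sketch of the pattern, using a fresh pint registry rather than the project's shared one:

    import numpy as np
    import pint
    from uncertainties import ufloat
    from uncertainties import unumpy as unp

    ureg = pint.UnitRegistry()
    Q_ = ureg.Quantity

    s1s2 = Q_(ufloat(10.0, 1.0), "t")   # measured, with uncertainty
    s3 = Q_(np.nan, "t")                # missing scope-3 realization

    # Mirrors IEmissionRealization.add: skip the addend when its magnitude is NaN
    total = s1s2 + (0 if unp.isnan(s3.m) else s3)
    print(total)  # 10.0+/-1.0 tonne: the NaN contributed nothing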

ITR/portfolio_aggregation.py

Lines changed: 0 additions & 4 deletions

@@ -11,10 +11,6 @@
 import pint
 import pint_pandas
 
-ureg = pint.get_application_registry()
-Q_ = ureg.Quantity
-PA_ = pint_pandas.PintArray
-
 from .configs import PortfolioAggregationConfig, ColumnsConfig, logger
 from .interfaces import EScope
 
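These module-level aliases (ureg, Q_, PA_) were unused here and, presumably, risked drifting away from the single registry configured in ITR.data.osc_units; pint quantities only interoperate within one registry. An illustrative sketch of the failure mode that mixed registries produce:

    import pint

    # Two independent registries: quantities from one cannot combine with the other
    r1 = pint.UnitRegistry()
    r2 = pint.UnitRegistry()
    try:
        r1.Quantity(1.0, "t") + r2.Quantity(1.0, "t")
    except ValueError as err:
        print(err)  # pint refuses to operate across registries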
ITR/temperature_score.py

Lines changed: 13 additions & 7 deletions

@@ -63,6 +63,7 @@ def get_score(self, scorable_row: pd.Series) -> Tuple[
 
         # If only target data missing assign only trajectory_score to final score
         elif unp.isnan(scorable_row[self.c.COLS.CUMULATIVE_TARGET]) or scorable_row[self.c.COLS.CUMULATIVE_TARGET] == 0:
+            breakpoint()
             target_overshoot_ratio = np.nan
             target_temperature_score = np.nan
             trajectory_overshoot_ratio = scorable_row[self.c.COLS.CUMULATIVE_TRAJECTORY] / scorable_row[
@@ -155,8 +156,7 @@ def _prepare_data(self, data: pd.DataFrame):
             lambda row: self.get_score(row), axis=1))
 
         # Fix up dtypes for the new columns we just added
-        for c in [self.c.COLS.TEMPERATURE_SCORE, self.c.COLS.TRAJECTORY_SCORE, self.c.COLS.TRAJECTORY_SCORE,
-                  self.c.COLS.TARGET_SCORE]:
+        for c in [self.c.COLS.TEMPERATURE_SCORE, self.c.COLS.TRAJECTORY_SCORE, self.c.COLS.TARGET_SCORE]:
             scoring_data[c] = scoring_data[c].astype('pint[delta_degC]')
 
         scoring_data = self.cap_scores(scoring_data)
@@ -169,11 +169,14 @@ def _calculate_company_score(self, data):
         :param data: The original data set as a pandas data frame
         :return: The data frame, with an updated s1s2s3 temperature score
         """
-        # Calculate the GHC
-        company_data = data[
+        from ITR.utils import umean
+
+        # Calculate the GHC--using umean to deal with uncertainties
+        # FIXME: what about median vs. mean?
+        company_data = umean(data[
             [self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE, self.c.COLS.GHG_SCOPE12,
              self.c.COLS.GHG_SCOPE3, self.c.COLS.TEMPERATURE_SCORE, self.c.SCORE_RESULT_TYPE]
-        ].groupby([self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE]).mean()
+        ].groupby([self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE]))
 
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
@@ -230,15 +233,18 @@ def _get_aggregations(self, data: pd.DataFrame, total_companies: int) -> Tuple[A
         data[self.c.COLS.CONTRIBUTION] = weighted_scores
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            contributions = data \
+            data_contributions = data[['company_name', 'company_id', 'temperature_score', 'contribution_relative', 'contribution']] \
                 .sort_values(self.c.COLS.CONTRIBUTION_RELATIVE, ascending=False) \
                 .where(pd.notnull(data), 0) \
                 .to_dict(orient="records")
+        contribution_dicts = [{k:v if isinstance(v, str) else str(v)
+                               for k,v in contribution.items()}
+                              for contribution in data_contributions]
         aggregations = (Aggregation(
             score=weighted_scores.sum(),
             # proportion is not declared by anything to be a percent, so we make it a number from 0..1
             proportion=len(weighted_scores) / total_companies,
-            contributions=[AggregationContribution.parse_obj(contribution) for contribution in contributions]), \
+            contributions=[AggregationContribution.parse_obj(contribution) for contribution in contribution_dicts]), \
                        data[self.c.COLS.CONTRIBUTION_RELATIVE], \
                        data[self.c.COLS.CONTRIBUTION])
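The new contribution_dicts step exists because record values are no longer guaranteed to be plain floats: once scores can carry ufloat magnitudes, every non-string value is stringified before pydantic's AggregationContribution.parse_obj sees it. The same transform on a hypothetical record:

    records = [{
        "company_name": "ACME",                        # hypothetical values
        "company_id": "US0000000001",
        "temperature_score": "2.10+/-0.15 delta_degC",
        "contribution_relative": 0.42,
        "contribution": 1.7,
    }]

    # As in the commit: leave strings alone, str() everything else
    contribution_dicts = [{k: v if isinstance(v, str) else str(v)
                           for k, v in record.items()}
                          for record in records]
    print(contribution_dicts[0]["contribution_relative"])  # '0.42'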
ITR/utils.py

Lines changed: 25 additions & 0 deletions

@@ -1,10 +1,13 @@
 from __future__ import annotations
 
 import pandas as pd
+import numpy as np
 from pathlib import Path
 from typing import List, Optional, Tuple
 from ITR.data.osc_units import ureg
 import pint
+from uncertainties import unumpy as unp
+from uncertainties import ufloat, UFloat
 
 from .interfaces import PortfolioCompany, EScope, ETimeFrames, ScoreAggregations, TemperatureScoreControls
 from .configs import ColumnsConfig, TemperatureScoreConfig, LoggingConfig, logger
@@ -127,3 +130,25 @@ def calculate(portfolio_data: pd.DataFrame, fallback_score: pint.Quantity['delta
     scores = ts.anonymize_data_dump(scores)
 
     return scores, aggregations
+
+
+# https://stackoverflow.com/a/74137209/1291237
+def umean(quantified_data):
+    """
+    Assuming Gaussian statistics, uncertainties stem from Gaussian parent distributions. In such a case,
+    it is standard to weight the measurements (nominal values) by the inverse variance.
+
+    This function uses error propagation to get an uncertainty for the weighted average.
+    :param: A set of uncertainty values
+    :return: The weighted mean of the values, with a freshly calculated error term
+    """
+    values = np.array(list(map(lambda v: v.m if isinstance(v.m, UFloat) else ufloat(v.m, 0), quantified_data)))
+    epsilon = unp.nominal_values(values).mean()/(2e13)
+    wavg = ufloat(sum([v.n/(v.s**2+epsilon) for v in values])/sum([1/(v.s**2+epsilon) for v in values]),
+                  np.sqrt(len(values)/sum([1/(v.s**2+epsilon) for v in values])))
+    if wavg.s <= np.sqrt(2*epsilon):
+        if wavg.s > epsilon:
+            logger.debug(f"Casting out small uncertainty {wavg.s} from {wavg}; epsilon = {epsilon}.")
+        wavg = wavg.n
+
+    return wavg
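A quick usage sketch for umean, under the docstring's assumptions: inputs are quantities whose magnitudes may mix plain floats and UFloats, weights are inverse variances, and epsilon only exists to keep exact (zero-uncertainty) values from dividing by zero; note that such exact values get weight of roughly 1/epsilon and will dominate the average. The fresh registry below is illustrative, not the project's shared one:

    import pint
    from uncertainties import ufloat
    from ITR.utils import umean

    ureg = pint.UnitRegistry()
    Q_ = ureg.Quantity

    measurements = [Q_(ufloat(10.0, 1.0), "t"),
                    Q_(ufloat(12.0, 2.0), "t")]

    # umean works on bare magnitudes; callers re-attach units afterwards,
    # as template.py does with est = Q_(wavg, units)
    wavg = umean(measurements)
    print(wavg)  # weighted toward 10.0, which has the smaller variance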
