Commit

This completes the first smoke-test of the ITR_UI tool working with uncertainties.

We do not yet actually plot error bars...that's next.  But we can do all the calculations
the GUI requires, which took some work.

Signed-off-by: MichaelTiemann <[email protected]>
MichaelTiemannOSC committed Oct 23, 2022
1 parent 9e64c82 commit 50b6f3a
Showing 9 changed files with 259 additions and 128 deletions.
5 changes: 5 additions & 0 deletions ITR/data/__init__.py
@@ -7,6 +7,9 @@
from openscm_units import unit_registry
import re

import numpy as np
from uncertainties import ufloat

# openscm_units doesn't make it easy to set preprocessors. This is one way to do it.
unit_registry.preprocessors=[
lambda s1: re.sub(r'passenger.km', 'pkm', s1),
@@ -32,3 +35,5 @@
# FIXME: delay loading of pint_pandas until after we've initialized ourselves
from pint_pandas import PintType
PintType.ureg = ureg

_ufloat_nan = ufloat(np.nan, 0.0)
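
Why quantify NaN this way: wrapping np.nan as ufloat(np.nan, 0.0) lets a missing value live inside a Quantity whose magnitude is a UFloat, and unp.isnan detects it exactly as it detects a plain float NaN. A minimal sketch (the print lines are illustrative, not part of the module):

    import numpy as np
    from uncertainties import ufloat
    from uncertainties import unumpy as unp

    _ufloat_nan = ufloat(np.nan, 0.0)

    # unp.isnan tests the nominal value, so float NaNs and UFloat NaNs
    # can be screened with a single code path.
    print(unp.isnan(np.nan))            # True
    print(unp.isnan(_ufloat_nan))       # True
    print(unp.isnan(ufloat(1.0, 0.1)))  # False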
192 changes: 118 additions & 74 deletions ITR/data/base_providers.py

Large diffs are not rendered by default.

34 changes: 29 additions & 5 deletions ITR/data/data_warehouse.py
@@ -13,6 +13,8 @@
from ITR.data.data_providers import CompanyDataProvider, ProductionBenchmarkDataProvider, IntensityBenchmarkDataProvider
from ITR.configs import ColumnsConfig, TemperatureScoreConfig, LoggingConfig

import pint
from uncertainties import unumpy as unp

logger = logging.getLogger(__name__)
LoggingConfig.add_config_to_logger(logger)

@@ -44,10 +46,10 @@ def __init__(self, company_data: CompanyDataProvider,
# After projections have been made, shift S3 data into S1S2. If we shift before we project,
# then S3 targets will not be projected correctly.
for c in self.company_data._companies:
if c.ghg_s3:
if c.ghg_s3 and not unp.isnan(c.ghg_s3.m):
# For Production-centric and energy-only data (except for Cement), convert all S3 numbers to S1 numbers
c.ghg_s1s2 = c.ghg_s1s2 + c.ghg_s3
c.ghg_s3 = Q_(0, c.ghg_s3.u)
c.ghg_s3 = Q_(0.0, c.ghg_s3.u)
if c.historic_data:
if c.historic_data.emissions and c.historic_data.emissions.S3:
c.historic_data.emissions.S1S2 = list( map(IEmissionRealization.add, c.historic_data.emissions.S1S2, c.historic_data.emissions.S3) )
@@ -82,12 +84,26 @@ def get_preprocessed_company_data(self, company_ids: List[str]) -> List[ICompany
company_info_at_base_year).sort_index()

# trajectories are projected from historic data and we are careful to fill all gaps between historic and projections
# FIXME: we just computed ALL company data above into a dataframe. Why not use that?
projected_trajectories = self.company_data.get_company_projected_trajectories(company_ids)
df_trajectory = self._get_cumulative_emissions(
projected_ei=projected_trajectories,
projected_production=projected_production).rename(self.column_config.CUMULATIVE_TRAJECTORY)

def fix_ragged_projected_targets(x):
year = x.index[0]
x_val = x[year]
if unp.isnan(x_val.m):
historic_ei_dict = { d['year']:d['value'] for d in df_company_data.loc[x.name].historic_data['emissions_intensities']['S1S2']}
return historic_ei_dict[year]
else:
return x_val

projected_targets = self.company_data.get_company_projected_targets(company_ids)
# Fill in ragged left edge of projected_targets with historic data, interpolating where we need to
projected_targets[projected_targets.columns[0]] = (
projected_targets[[projected_targets.columns[0]]].apply(fix_ragged_projected_targets, axis=1)
)
df_target = self._get_cumulative_emissions(
projected_ei=projected_targets,
projected_production=projected_production).rename(self.column_config.CUMULATIVE_TARGET)
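
In outline, fix_ragged_projected_targets backfills the first projection year for companies whose targets begin later than the projection window, pulling the value from historic emissions intensities. A toy, self-contained sketch of the same pattern (company labels, numbers, and the historic_ei dict are invented stand-ins for what the real code reads out of df_company_data):

    import numpy as np
    import pandas as pd

    # Rows: companies; columns: projection years. Company B's target starts
    # in 2021, so its 2020 cell is NaN -- the "ragged left edge".
    projected_targets = pd.DataFrame({2020: [0.50, np.nan], 2021: [0.48, 0.70]},
                                     index=["A", "B"])
    historic_ei = {"B": {2020: 0.75}}  # stand-in for historic S1S2 intensities

    first_col = projected_targets.columns[0]

    def fix_ragged(row):
        val = row[first_col]
        return historic_ei[row.name][first_col] if np.isnan(val) else val

    projected_targets[first_col] = projected_targets.apply(fix_ragged, axis=1)
    print(projected_targets)  # B's 2020 value is now 0.75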
@@ -134,7 +150,15 @@ def _get_cumulative_emissions(self, projected_ei: pd.DataFrame, projected_produc
:return: cumulative emissions based on weighted sum of emissions intensity * production
"""
projected_emissions = projected_ei.multiply(projected_production)
projected_emissions = projected_emissions.applymap(lambda x: np.nan if unp.isnan(x) else x)
null_idx = projected_emissions.index[projected_emissions.isnull().all(axis=1)]
return pd.concat([projected_emissions.loc[null_idx, projected_emissions.columns[0]],
return projected_emissions.sum(axis=1).astype('pint[Mt CO2]')

# The following code is broken, due to the way unp.isnan strips the Quantity wrapper from scalars.
# It was written to rescue data from automotive, but may not be needed anymore.
nan_emissions = projected_emissions.applymap(lambda x: np.nan if unp.isnan(x) else x)
if nan_emissions.isnull().any(axis=0).any():
breakpoint()
null_idx = nan_emissions.index[nan_emissions.isnull().all(axis=1)]
# FIXME: this replaces the quantified NaNs in projected_emissions with straight-up NaNs,
# while also converting the remaining emissions to a consistent unit of 'Mt CO2'
return pd.concat([nan_emissions.loc[null_idx, nan_emissions.columns[0]],
projected_emissions.loc[projected_emissions.index.difference(null_idx)].sum(axis=1)]).astype('pint[Mt CO2]')
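
For orientation, the quantity computed by _get_cumulative_emissions is a weighted sum across years: emissions intensity times production, accumulated over the projection horizon and expressed in Mt CO2. A minimal sketch with plain pint Quantities (all numbers invented; the one-line CO2 definition is a stand-in for the richer GHG units in ITR's registry):

    import pint

    ureg = pint.UnitRegistry()
    ureg.define("CO2 = [co2_emissions]")  # stand-in for ITR's registry
    Q_ = ureg.Quantity

    intensities = [Q_(0.50, "t CO2/MWh"), Q_(0.45, "t CO2/MWh"), Q_(0.40, "t CO2/MWh")]
    production = [Q_(1000, "MWh"), Q_(1100, "MWh"), Q_(1200, "MWh")]

    # Cumulative emissions: sum of intensity * production over the horizon.
    cumulative = sum((ei * p for ei, p in zip(intensities, production)),
                     Q_(0, "t CO2"))
    print(cumulative.to("Mt CO2"))  # ~0.001475 Mt CO2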
40 changes: 28 additions & 12 deletions ITR/data/template.py
@@ -73,9 +73,16 @@ def _estimated_value(y: pd.Series) -> pint.Quantity:
breakpoint()
raise ValueError
# This relies on the fact that we can now see Quantity(np.nan, ...) for both float and ufloat magnitudes
x = y[~y.map(unp.isnan)]
x = PA_._from_sequence(y)
xq = x.quantity
xm = xq.m
x = y[~unp.isnan(PA_._from_sequence(y).quantity.m)]
except TypeError:
logger.error(f"type_error({y}) returning {y.values}[0]")
breakpoint()
x = PA_._from_sequence(y)
xq = x.quantity
xm = xq.m
return y.iloc[0]
if len(x) == 0:
# If all inputs are NaN, return the first NaN
@@ -84,17 +91,11 @@ def _estimated_value(y: pd.Series) -> pint.Quantity:
# If there's only one non-NaN input, return that one
return x.iloc[0]
if isinstance(x.values[0], pint.Quantity):
from ITR.utils import umean
values = x.values
units = values[0].u
assert all([v.u==units for v in values])
values = np.array(list(map(lambda v: v.m if isinstance(v.m, UFloat) else ufloat(v.m, 0), values)))
epsilon = unp.nominal_values(values).mean()/(2e13)
wavg = ufloat(sum([v.n/(v.s**2+epsilon) for v in values])/sum([1/(v.s**2+epsilon) for v in values]),
np.sqrt(len(values)/sum([1/(v.s**2+epsilon) for v in values])))
if wavg.s <= np.sqrt(2*epsilon):
if wavg.s > epsilon:
logger.warning(f"Casting out small uncertainty {wavg.s} from {wavg}; epsilon = {epsilon}.")
wavg = wavg.n
wavg = umean(values)
est = Q_(wavg, units)
else:
logger.error(f"non-qty: _estimated_values called on non-Quantity {x.values[0]};;;")
@@ -284,12 +285,12 @@ def _fixup_name(x):
df3 = df2.reset_index().set_index(['company_id', 'variable', 'scope'])
df3 = pd.concat([df3.xs(VariablesConfig.PRODUCTIONS, level=1, drop_level=False)
.apply(
lambda x: x.map(lambda y: Q_(y if y is not pd.NA else np.nan,
lambda x: x.map(lambda y: Q_(float(y) if y is not pd.NA else np.nan,
df_fundamentals.loc[df_fundamentals.company_id == x.name[0],
'production_metric'].squeeze())), axis=1),
df3.xs(VariablesConfig.EMISSIONS, level=1, drop_level=False)
.apply(lambda x: x.map(
lambda y: Q_(y if y is not pd.NA else np.nan,
lambda y: Q_(float(y) if y is not pd.NA else np.nan,
df_fundamentals.loc[df_fundamentals.company_id == x.name[0],
'emissions_metric'].squeeze())), axis=1)])
df4 = df3.xs(VariablesConfig.EMISSIONS, level=1) / df3.xs((VariablesConfig.PRODUCTIONS, 'production'),
@@ -306,7 +307,7 @@ def _fixup_name(x):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
for col in esg_year_columns:
qty_col = df_esg[esg_has_units].apply(lambda x: Q_(np.nan if pd.isna(x[col]) else x[col], x['unit']), axis=1)
qty_col = df_esg[esg_has_units].apply(lambda x: Q_(np.nan if pd.isna(x[col]) else float(x[col]), x['unit']), axis=1)
df_esg[col] = df_esg[col].astype('object')
df_esg.loc[df_esg[esg_has_units].index, col] = qty_col
# FIXME: ...but we have to spill our units back to the corp table for now
@@ -472,6 +473,21 @@ def _company_df_to_model(self, df_fundamentals: pd.DataFrame,
# TODO pull ghg_s1s2 and ghg_s3 from historic data as appropriate

if df_historic_data is not None:
# FIXME: Is this the best place to finalize base_year_production, ghg_s1s2, and ghg_s3 data?
# Something tells me these parameters should be removed in favor of querying historical data directly
if ColumnsConfig.BASE_YEAR_PRODUCTION not in company_data:
company_data[ColumnsConfig.BASE_YEAR_PRODUCTION] = df_historic_data.loc[
company_data[ColumnsConfig.COMPANY_ID], 'Productions', 'production'][
TemperatureScoreConfig.CONTROLS_CONFIG.base_year]
if ColumnsConfig.GHG_SCOPE12 not in company_data:
company_data[ColumnsConfig.GHG_SCOPE12] = df_historic_data.loc[
company_data[ColumnsConfig.COMPANY_ID], 'Emissions', 'S1S2'][
TemperatureScoreConfig.CONTROLS_CONFIG.base_year]
if ColumnsConfig.GHG_SCOPE3 not in company_data:
company_data[ColumnsConfig.GHG_SCOPE3] = df_historic_data.loc[
company_data[ColumnsConfig.COMPANY_ID], 'Emissions', 'S3'][
TemperatureScoreConfig.CONTROLS_CONFIG.base_year]

company_data[ColumnsConfig.HISTORIC_DATA] = self._convert_historic_data(
df_historic_data.loc[[company_data[ColumnsConfig.COMPANY_ID]]].reset_index()).dict()
else:
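The fallback block above reads base-year values from df_historic_data, which is indexed by (company_id, variable, scope) with years as columns. A toy sketch of that lookup pattern (index labels and numbers invented):

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [("C1", "Productions", "production"), ("C1", "Emissions", "S1S2")],
        names=["company_id", "variable", "scope"])
    df_historic = pd.DataFrame([[100.0, 110.0], [50.0, 48.0]],
                               index=idx, columns=[2019, 2020])

    base_year = 2019
    # .loc with the full index tuple selects one row; indexing by year
    # then picks out the base-year value.
    base_production = df_historic.loc[("C1", "Productions", "production")][base_year]
    print(base_production)  # 100.0
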
25 changes: 15 additions & 10 deletions ITR/interfaces.py
@@ -202,13 +202,12 @@ def __get_validators__(cls):
@classmethod
def validate(cls, value):
if isinstance(value, str):
v, u = value.split(' ', 1)
try:
q = Q_(float(v), u)
q = Q_(value)
except ValueError:
raise ValueError(f"cannot convert '{value}' to quantity")
quantity = q
elif isinstance(value, Quantity):
elif isinstance(value, pint.Quantity):
quantity = value
else:
raise TypeError(f"quantity takes either a Q_ value or a string fully expressing a quantified value; got {value}")
@@ -652,7 +651,8 @@ class ICompanyEIProjection(BaseModel):

def add(self, o):
assert self.year==o.year
return IEIRealization(year=self.year, value = self.value + o.value)
return IEIRealization(year=self.year,
value = self.value + (0 if unp.isnan(o.value.m) else o.value))


class ICompanyEIProjections(BaseModel):
@@ -690,7 +690,8 @@ class IEmissionRealization(BaseModel):

def add(self, o):
assert self.year==o.year
return IEmissionRealization(year=self.year, value = self.value + o.value)
return IEmissionRealization(year=self.year,
value = self.value + (0 if unp.isnan(o.value.m) else o.value))


class IHistoricEmissionsScopes(BaseModel):
@@ -707,7 +708,8 @@ class IEIRealization(BaseModel):

def add(self, o):
assert self.year==o.year
return IEIRealization(year=self.year, value = self.value + o.value)
return IEIRealization(year=self.year,
value = self.value + (0 if unp.isnan(o.value.m) else o.value))
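
All three add helpers implement the same rule: a NaN operand is treated as zero so a missing scope does not poison the sum. A standalone sketch of the rule (function name and values invented; pint special-cases adding a bare 0 to a Quantity, which is what makes the + 0 branch dimensionally legal):

    import numpy as np
    import pint
    from uncertainties import unumpy as unp

    ureg = pint.UnitRegistry()
    ureg.define("CO2 = [co2_emissions]")  # stand-in for ITR's registry
    Q_ = ureg.Quantity

    def add_treating_nan_as_zero(a, b):
        # If b is NaN, keep a unchanged; otherwise add the two quantities.
        return a + (0 if unp.isnan(b.m) else b)

    s1s2 = Q_(10.0, "t CO2")
    print(add_treating_nan_as_zero(s1s2, Q_(np.nan, "t CO2")))  # 10.0 t CO2
    print(add_treating_nan_as_zero(s1s2, Q_(2.5, "t CO2")))     # 12.5 t CO2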


class IHistoricEIScopes(BaseModel):
@@ -812,7 +814,7 @@ def _sector_to_production_units(self, sector, region="Global"):
return units

def _get_base_realization_from_historic(self, realized_values: List[BaseModel], units, base_year=None):
valid_realizations = [rv for rv in realized_values if rv.value is not None and not unp.isnan(rv.value)]
valid_realizations = [rv for rv in realized_values if rv.value is not None and not unp.isnan(rv.value.magnitude)]
if not valid_realizations:
retval = realized_values[0].copy()
retval.year = None
@@ -821,6 +823,7 @@ def _get_base_realization_from_historic(self, realized_values: List[BaseModel],
if base_year and valid_realizations[0].year != base_year:
retval = realized_values[0].copy()
retval.year = base_year
# FIXME: Unless and until we accept uncertainties as input, rather than computed data, we don't need to make this a UFloat here
retval.value = Q_(np.nan, units)
return retval
return valid_realizations[0]
@@ -847,14 +850,16 @@ def __init__(self, emissions_metric=None, production_metric=None, base_year_prod
self.emissions_metric = EmissionsMetric('t CO2')
# TODO: Should raise a warning here
base_year = None
if self.base_year_production:
pass
# Right now historic_data comes in via template.py ESG data
if self.historic_data and self.historic_data.productions:
elif self.historic_data and self.historic_data.productions:
# TODO: This is a hack to get things going.
base_realization = self._get_base_realization_from_historic(self.historic_data.productions, str(self.production_metric), base_year)
base_year = base_realization.year
self.base_year_production = base_realization.value
else:
# raise ValueError(f"missing historic data for base_year_production for {self.company_name}")
raise ValueError(f"missing historic data for base_year_production for {self.company_name}")
self.base_year_production = Q_(np.nan, str(self.production_metric))
if self.ghg_s1s2 is None and self.historic_data and self.historic_data.emissions:
if self.historic_data.emissions.S1S2:
@@ -902,10 +907,10 @@ class ICompanyAggregates(ICompanyData):
benchmark_temperature: quantity('delta_degC')
benchmark_global_budget: EmissionsQuantity

# projected_production is computed but never saved, so computed at least 2x: initialization/projection and cumulative budget
# projected_targets: Optional[ICompanyEIProjectionsScopes]
# projected_intensities: Optional[ICompanyEIProjectionsScopes]


class TemperatureScoreControls(BaseModel):
base_year: int
target_end_year: int
4 changes: 0 additions & 4 deletions ITR/portfolio_aggregation.py
@@ -11,10 +11,6 @@
import pint
import pint_pandas

ureg = pint.get_application_registry()
Q_ = ureg.Quantity
PA_ = pint_pandas.PintArray

from .configs import PortfolioAggregationConfig, ColumnsConfig, logger
from .interfaces import EScope

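Context for the lines deleted above: creating registry aliases at module import time risks binding a different UnitRegistry than the one configured in ITR.data.osc_units, and pint refuses to mix quantities from different registries. A sketch of that failure mode (registry names invented):

    import pint

    ureg_a = pint.UnitRegistry()
    ureg_b = pint.UnitRegistry()
    try:
        # Quantities from two independent registries cannot be combined.
        ureg_a.Quantity(1, "m") + ureg_b.Quantity(1, "m")
    except ValueError as err:
        print(err)  # cannot operate between quantities of different registries

    # Sharing one application registry everywhere avoids the problem.
    pint.set_application_registry(ureg_a)
    ureg = pint.get_application_registry()
    print(ureg.Quantity(2, "m") + ureg_a.Quantity(3, "m"))  # 5 meter
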
20 changes: 13 additions & 7 deletions ITR/temperature_score.py
@@ -63,6 +63,7 @@ def get_score(self, scorable_row: pd.Series) -> Tuple[

# If only target data missing assign only trajectory_score to final score
elif unp.isnan(scorable_row[self.c.COLS.CUMULATIVE_TARGET]) or scorable_row[self.c.COLS.CUMULATIVE_TARGET] == 0:
breakpoint()
target_overshoot_ratio = np.nan
target_temperature_score = np.nan
trajectory_overshoot_ratio = scorable_row[self.c.COLS.CUMULATIVE_TRAJECTORY] / scorable_row[
@@ -155,8 +156,7 @@ def _prepare_data(self, data: pd.DataFrame):
lambda row: self.get_score(row), axis=1))

# Fix up dtypes for the new columns we just added
for c in [self.c.COLS.TEMPERATURE_SCORE, self.c.COLS.TRAJECTORY_SCORE, self.c.COLS.TRAJECTORY_SCORE,
self.c.COLS.TARGET_SCORE]:
for c in [self.c.COLS.TEMPERATURE_SCORE, self.c.COLS.TRAJECTORY_SCORE, self.c.COLS.TARGET_SCORE]:
scoring_data[c] = scoring_data[c].astype('pint[delta_degC]')

scoring_data = self.cap_scores(scoring_data)
@@ -169,11 +169,14 @@ def _calculate_company_score(self, data):
:param data: The original data set as a pandas data frame
:return: The data frame, with an updated s1s2s3 temperature score
"""
# Calculate the GHC
company_data = data[
from ITR.utils import umean

# Calculate the GHC--using umean to deal with uncertainties
# FIXME: what about median vs. mean?
company_data = umean(data[
[self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE, self.c.COLS.GHG_SCOPE12,
self.c.COLS.GHG_SCOPE3, self.c.COLS.TEMPERATURE_SCORE, self.c.SCORE_RESULT_TYPE]
].groupby([self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE]).mean()
].groupby([self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE]))

with warnings.catch_warnings():
warnings.simplefilter("ignore")
@@ -230,15 +233,18 @@ def _get_aggregations(self, data: pd.DataFrame, total_companies: int) -> Tuple[A
data[self.c.COLS.CONTRIBUTION] = weighted_scores
with warnings.catch_warnings():
warnings.simplefilter("ignore")
contributions = data \
data_contributions = data[['company_name', 'company_id', 'temperature_score', 'contribution_relative', 'contribution']] \
.sort_values(self.c.COLS.CONTRIBUTION_RELATIVE, ascending=False) \
.where(pd.notnull(data), 0) \
.to_dict(orient="records")
contribution_dicts = [{k:v if isinstance(v, str) else str(v)
for k,v in contribution.items()}
for contribution in data_contributions]
aggregations = (Aggregation(
score=weighted_scores.sum(),
# proportion is not declared by anything to be a percent, so we make it a number from 0..1
proportion=len(weighted_scores) / total_companies,
contributions=[AggregationContribution.parse_obj(contribution) for contribution in contributions]), \
contributions=[AggregationContribution.parse_obj(contribution) for contribution in contribution_dicts]), \
data[self.c.COLS.CONTRIBUTION_RELATIVE], \
data[self.c.COLS.CONTRIBUTION])

25 changes: 25 additions & 0 deletions ITR/utils.py
@@ -1,10 +1,13 @@
from __future__ import annotations

import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Optional, Tuple
from ITR.data.osc_units import ureg
import pint
from uncertainties import unumpy as unp
from uncertainties import ufloat, UFloat

from .interfaces import PortfolioCompany, EScope, ETimeFrames, ScoreAggregations, TemperatureScoreControls
from .configs import ColumnsConfig, TemperatureScoreConfig, LoggingConfig, logger
@@ -127,3 +130,25 @@ def calculate(portfolio_data: pd.DataFrame, fallback_score: pint.Quantity['delta
scores = ts.anonymize_data_dump(scores)

return scores, aggregations


# https://stackoverflow.com/a/74137209/1291237
def umean(quantified_data):
"""
Assuming Gaussian statistics, uncertainties stem from Gaussian parent distributions. In such a case,
it is standard to weight the measurements (nominal values) by the inverse variance.
This function uses error propagation to compute the uncertainty of the weighted average.
:param quantified_data: A sequence of Quantity values whose magnitudes may carry uncertainties
:return: The weighted mean of the values, with a freshly calculated error term
"""
values = np.array(list(map(lambda v: v.m if isinstance(v.m, UFloat) else ufloat(v.m, 0), quantified_data)))
epsilon = unp.nominal_values(values).mean()/(2e13)
wavg = ufloat(sum([v.n/(v.s**2+epsilon) for v in values])/sum([1/(v.s**2+epsilon) for v in values]),
np.sqrt(len(values)/sum([1/(v.s**2+epsilon) for v in values])))
if wavg.s <= np.sqrt(2*epsilon):
if wavg.s > epsilon:
logger.debug(f"Casting out small uncertainty {wavg.s} from {wavg}; epsilon = {epsilon}.")
wavg = wavg.n

return wavg
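
A quick usage sketch of umean (values invented; note that umean returns a bare magnitude, so callers such as _estimated_value re-wrap the result with Q_). Inverse-variance weighting means the tighter measurement dominates:

    from uncertainties import ufloat

    from ITR.data.osc_units import ureg  # ITR's registry knows CO2 units
    from ITR.utils import umean

    Q_ = ureg.Quantity

    # Two measurements of the same emissions intensity; the first has the
    # smaller uncertainty, so it dominates the weighted average.
    values = [Q_(ufloat(0.50, 0.01), "t CO2/MWh"),
              Q_(ufloat(0.60, 0.10), "t CO2/MWh")]
    print(umean(values))  # approximately 0.501+/-0.014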