Commit

This completes the first smoke-test of the ITR_UI tool working with uncertainties.

We do not yet actually plot error bars...that's next.  But we can do all the calculations
the GUI requires, which took some work.

Signed-off-by: MichaelTiemann <[email protected]>
MichaelTiemannOSC committed Oct 23, 2022
1 parent 9e64c82 commit 50b6f3a
Showing 9 changed files with 259 additions and 128 deletions.
5 changes: 5 additions & 0 deletions ITR/data/__init__.py
@@ -7,6 +7,9 @@
from openscm_units import unit_registry
import re

import numpy as np
from uncertainties import ufloat

# openscm_units doesn't make it easy to set preprocessors. This is one way to do it.
unit_registry.preprocessors=[
lambda s1: re.sub(r'passenger.km', 'pkm', s1),
@@ -32,3 +35,5 @@
# FIXME: delay loading of pint_pandas until after we've initialized ourselves
from pint_pandas import PintType
PintType.ureg = ureg

_ufloat_nan = ufloat(np.nan, 0.0)
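
Why quantify NaN this way: wrapping np.nan as ufloat(np.nan, 0.0) lets a missing value live inside a Quantity whose magnitude is a UFloat, and unp.isnan detects it exactly as it detects a plain float NaN. A minimal sketch (the print lines are illustrative, not part of the module):

    import numpy as np
    from uncertainties import ufloat
    from uncertainties import unumpy as unp

    _ufloat_nan = ufloat(np.nan, 0.0)

    # unp.isnan tests the nominal value, so float NaNs and UFloat NaNs
    # can be screened with a single code path.
    print(unp.isnan(np.nan))            # True
    print(unp.isnan(_ufloat_nan))       # True
    print(unp.isnan(ufloat(1.0, 0.1)))  # False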
192 changes: 118 additions & 74 deletions ITR/data/base_providers.py

Large diffs are not rendered by default.

34 changes: 29 additions & 5 deletions ITR/data/data_warehouse.py
@@ -13,6 +13,8 @@
from ITR.data.data_providers import CompanyDataProvider, ProductionBenchmarkDataProvider, IntensityBenchmarkDataProvider
from ITR.configs import ColumnsConfig, TemperatureScoreConfig, LoggingConfig

import pint
from uncertainties import unumpy as unp

logger = logging.getLogger(__name__)
LoggingConfig.add_config_to_logger(logger)

@@ -44,10 +46,10 @@ def __init__(self, company_data: CompanyDataProvider,
# After projections have been made, shift S3 data into S1S2. If we shift before we project,
# then S3 targets will not be projected correctly.
for c in self.company_data._companies:
if c.ghg_s3:
if c.ghg_s3 and not unp.isnan(c.ghg_s3.m):
# For Production-centric and energy-only data (except for Cement), convert all S3 numbers to S1 numbers
c.ghg_s1s2 = c.ghg_s1s2 + c.ghg_s3
c.ghg_s3 = Q_(0, c.ghg_s3.u)
c.ghg_s3 = Q_(0.0, c.ghg_s3.u)
if c.historic_data:
if c.historic_data.emissions and c.historic_data.emissions.S3:
c.historic_data.emissions.S1S2 = list( map(IEmissionRealization.add, c.historic_data.emissions.S1S2, c.historic_data.emissions.S3) )
@@ -82,12 +84,26 @@ def get_preprocessed_company_data(self, company_ids: List[str]) -> List[ICompany
company_info_at_base_year).sort_index()

# trajectories are projected from historic data and we are careful to fill all gaps between historic and projections
# FIXME: we just computed ALL company data above into a dataframe. Why not use that?
projected_trajectories = self.company_data.get_company_projected_trajectories(company_ids)
df_trajectory = self._get_cumulative_emissions(
projected_ei=projected_trajectories,
projected_production=projected_production).rename(self.column_config.CUMULATIVE_TRAJECTORY)

def fix_ragged_projected_targets(x):
year = x.index[0]
x_val = x[year]
if unp.isnan(x_val.m):
historic_ei_dict = { d['year']:d['value'] for d in df_company_data.loc[x.name].historic_data['emissions_intensities']['S1S2']}
return historic_ei_dict[year]
else:
return x_val

projected_targets = self.company_data.get_company_projected_targets(company_ids)
# Fill in ragged left edge of projected_targets with historic data, interpolating where we need to
projected_targets[projected_targets.columns[0]] = (
projected_targets[[projected_targets.columns[0]]].apply(fix_ragged_projected_targets, axis=1)
)
df_target = self._get_cumulative_emissions(
projected_ei=projected_targets,
projected_production=projected_production).rename(self.column_config.CUMULATIVE_TARGET)
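
In outline, fix_ragged_projected_targets backfills the first projection year for companies whose targets begin later than the projection window, pulling the value from historic emissions intensities. A toy, self-contained sketch of the same pattern (company labels, numbers, and the historic_ei dict are invented stand-ins for what the real code reads out of df_company_data):

    import numpy as np
    import pandas as pd

    # Rows: companies; columns: projection years. Company B's target starts
    # in 2021, so its 2020 cell is NaN -- the "ragged left edge".
    projected_targets = pd.DataFrame({2020: [0.50, np.nan], 2021: [0.48, 0.70]},
                                     index=["A", "B"])
    historic_ei = {"B": {2020: 0.75}}  # stand-in for historic S1S2 intensities

    first_col = projected_targets.columns[0]

    def fix_ragged(row):
        val = row[first_col]
        return historic_ei[row.name][first_col] if np.isnan(val) else val

    projected_targets[first_col] = projected_targets.apply(fix_ragged, axis=1)
    print(projected_targets)  # B's 2020 value is now 0.75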
@@ -134,7 +150,15 @@ def _get_cumulative_emissions(self, projected_ei: pd.DataFrame, projected_produc
:return: cumulative emissions based on weighted sum of emissions intensity * production
"""
projected_emissions = projected_ei.multiply(projected_production)
projected_emissions = projected_emissions.applymap(lambda x: np.nan if unp.isnan(x) else x)
null_idx = projected_emissions.index[projected_emissions.isnull().all(axis=1)]
return pd.concat([projected_emissions.loc[null_idx, projected_emissions.columns[0]],
return projected_emissions.sum(axis=1).astype('pint[Mt CO2]')

# The following code is broken, due to the way unp.isnan strips the Quantity wrapper from scalars.
# It was written to rescue data from automotive, but may not be needed anymore.
nan_emissions = projected_emissions.applymap(lambda x: np.nan if unp.isnan(x) else x)
if nan_emissions.isnull().any(axis=0).any():
breakpoint()
null_idx = nan_emissions.index[nan_emissions.isnull().all(axis=1)]
# FIXME: this replaces the quantified NaNs in projected_emissions with straight-up NaNs,
# while also converting the remaining emissions to a consistent unit of 'Mt CO2'
return pd.concat([nan_emissions.loc[null_idx, nan_emissions.columns[0]],
projected_emissions.loc[projected_emissions.index.difference(null_idx)].sum(axis=1)]).astype('pint[Mt CO2]')
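
For orientation, the quantity computed by _get_cumulative_emissions is a weighted sum across years: emissions intensity times production, accumulated over the projection horizon and expressed in Mt CO2. A minimal sketch with plain pint Quantities (all numbers invented; the one-line CO2 definition is a stand-in for the richer GHG units in ITR's registry):

    import pint

    ureg = pint.UnitRegistry()
    ureg.define("CO2 = [co2_emissions]")  # stand-in for ITR's registry
    Q_ = ureg.Quantity

    intensities = [Q_(0.50, "t CO2/MWh"), Q_(0.45, "t CO2/MWh"), Q_(0.40, "t CO2/MWh")]
    production = [Q_(1000, "MWh"), Q_(1100, "MWh"), Q_(1200, "MWh")]

    # Cumulative emissions: sum of intensity * production over the horizon.
    cumulative = sum((ei * p for ei, p in zip(intensities, production)),
                     Q_(0, "t CO2"))
    print(cumulative.to("Mt CO2"))  # ~0.001475 Mt CO2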
40 changes: 28 additions & 12 deletions ITR/data/template.py
@@ -73,9 +73,16 @@ def _estimated_value(y: pd.Series) -> pint.Quantity:
breakpoint()
raise ValueError
# This relies on the fact that we can now see Quantity(np.nan, ...) for both float and ufloat magnitudes
x = y[~y.map(unp.isnan)]
x = PA_._from_sequence(y)
xq = x.quantity
xm = xq.m
x = y[~unp.isnan(PA_._from_sequence(y).quantity.m)]
except TypeError:
logger.error(f"type_error({y}) returning {y.values}[0]")
breakpoint()
x = PA_._from_sequence(y)
xq = x.quantity
xm = xq.m
return y.iloc[0]
if len(x) == 0:
# If all inputs are NaN, return the first NaN
@@ -84,17 +91,11 @@ def _estimated_value(y: pd.Series) -> pint.Quantity:
# If there's only one non-NaN input, return that one
return x.iloc[0]
if isinstance(x.values[0], pint.Quantity):
from ITR.utils import umean
values = x.values
units = values[0].u
assert all([v.u==units for v in values])
values = np.array(list(map(lambda v: v.m if isinstance(v.m, UFloat) else ufloat(v.m, 0), values)))
epsilon = unp.nominal_values(values).mean()/(2e13)
wavg = ufloat(sum([v.n/(v.s**2+epsilon) for v in values])/sum([1/(v.s**2+epsilon) for v in values]),
np.sqrt(len(values)/sum([1/(v.s**2+epsilon) for v in values])))
if wavg.s <= np.sqrt(2*epsilon):
if wavg.s > epsilon:
logger.warning(f"Casting out small uncertainty {wavg.s} from {wavg}; epsilon = {epsilon}.")
wavg = wavg.n
wavg = umean(values)
est = Q_(wavg, units)
else:
logger.error(f"non-qty: _estimated_values called on non-Quantity {x.values[0]};;;")
@@ -284,12 +285,12 @@ def _fixup_name(x):
df3 = df2.reset_index().set_index(['company_id', 'variable', 'scope'])
df3 = pd.concat([df3.xs(VariablesConfig.PRODUCTIONS, level=1, drop_level=False)
.apply(
lambda x: x.map(lambda y: Q_(y if y is not pd.NA else np.nan,
lambda x: x.map(lambda y: Q_(float(y) if y is not pd.NA else np.nan,
df_fundamentals.loc[df_fundamentals.company_id == x.name[0],
'production_metric'].squeeze())), axis=1),
df3.xs(VariablesConfig.EMISSIONS, level=1, drop_level=False)
.apply(lambda x: x.map(
lambda y: Q_(y if y is not pd.NA else np.nan,
lambda y: Q_(float(y) if y is not pd.NA else np.nan,
df_fundamentals.loc[df_fundamentals.company_id == x.name[0],
'emissions_metric'].squeeze())), axis=1)])
df4 = df3.xs(VariablesConfig.EMISSIONS, level=1) / df3.xs((VariablesConfig.PRODUCTIONS, 'production'),
@@ -306,7 +307,7 @@ def _fixup_name(x):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
for col in esg_year_columns:
qty_col = df_esg[esg_has_units].apply(lambda x: Q_(np.nan if pd.isna(x[col]) else x[col], x['unit']), axis=1)
qty_col = df_esg[esg_has_units].apply(lambda x: Q_(np.nan if pd.isna(x[col]) else float(x[col]), x['unit']), axis=1)
df_esg[col] = df_esg[col].astype('object')
df_esg.loc[df_esg[esg_has_units].index, col] = qty_col
# FIXME: ...but we have to spill our units back to the corp table for now
@@ -472,6 +473,21 @@ def _company_df_to_model(self, df_fundamentals: pd.DataFrame,
# TODO pull ghg_s1s2 and ghg_s3 from historic data as appropriate

if df_historic_data is not None:
# FIXME: Is this the best place to finalize base_year_production, ghg_s1s2, and ghg_s3 data?
# Something tells me these parameters should be removed in favor of querying historical data directly
if ColumnsConfig.BASE_YEAR_PRODUCTION not in company_data:
company_data[ColumnsConfig.BASE_YEAR_PRODUCTION] = df_historic_data.loc[
company_data[ColumnsConfig.COMPANY_ID], 'Productions', 'production'][
TemperatureScoreConfig.CONTROLS_CONFIG.base_year]
if ColumnsConfig.GHG_SCOPE12 not in company_data:
company_data[ColumnsConfig.GHG_SCOPE12] = df_historic_data.loc[
company_data[ColumnsConfig.COMPANY_ID], 'Emissions', 'S1S2'][
TemperatureScoreConfig.CONTROLS_CONFIG.base_year]
if ColumnsConfig.GHG_SCOPE3 not in company_data:
company_data[ColumnsConfig.GHG_SCOPE3] = df_historic_data.loc[
company_data[ColumnsConfig.COMPANY_ID], 'Emissions', 'S3'][
TemperatureScoreConfig.CONTROLS_CONFIG.base_year]

company_data[ColumnsConfig.HISTORIC_DATA] = self._convert_historic_data(
df_historic_data.loc[[company_data[ColumnsConfig.COMPANY_ID]]].reset_index()).dict()
else:
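The fallback block above reads base-year values from df_historic_data, which is indexed by (company_id, variable, scope) with years as columns. A toy sketch of that lookup pattern (index labels and numbers invented):

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [("C1", "Productions", "production"), ("C1", "Emissions", "S1S2")],
        names=["company_id", "variable", "scope"])
    df_historic = pd.DataFrame([[100.0, 110.0], [50.0, 48.0]],
                               index=idx, columns=[2019, 2020])

    base_year = 2019
    # .loc with the full index tuple selects one row; indexing by year
    # then picks out the base-year value.
    base_production = df_historic.loc[("C1", "Productions", "production")][base_year]
    print(base_production)  # 100.0
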
25 changes: 15 additions & 10 deletions ITR/interfaces.py
@@ -202,13 +202,12 @@ def __get_validators__(cls):
@classmethod
def validate(cls, value):
if isinstance(value, str):
v, u = value.split(' ', 1)
try:
q = Q_(float(v), u)
q = Q_(value)
except ValueError:
raise ValueError(f"cannot convert '{value}' to quantity")
quantity = q
elif isinstance(value, Quantity):
elif isinstance(value, pint.Quantity):
quantity = value
else:
raise TypeError(f"quantity takes either a Q_ value or a string fully expressing a quantified value; got {value}")
@@ -652,7 +651,8 @@ class ICompanyEIProjection(BaseModel):

def add(self, o):
assert self.year==o.year
return IEIRealization(year=self.year, value = self.value + o.value)
return IEIRealization(year=self.year,
value = self.value + (0 if unp.isnan(o.value.m) else o.value))


class ICompanyEIProjections(BaseModel):
@@ -690,7 +690,8 @@ class IEmissionRealization(BaseModel):

def add(self, o):
assert self.year==o.year
return IEmissionRealization(year=self.year, value = self.value + o.value)
return IEmissionRealization(year=self.year,
value = self.value + (0 if unp.isnan(o.value.m) else o.value))


class IHistoricEmissionsScopes(BaseModel):
@@ -707,7 +708,8 @@ class IEIRealization(BaseModel):

def add(self, o):
assert self.year==o.year
return IEIRealization(year=self.year, value = self.value + o.value)
return IEIRealization(year=self.year,
value = self.value + (0 if unp.isnan(o.value.m) else o.value))
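
All three add helpers implement the same rule: a NaN operand is treated as zero so a missing scope does not poison the sum. A standalone sketch of the rule (function name and values invented; pint special-cases adding a bare 0 to a Quantity, which is what makes the + 0 branch dimensionally legal):

    import numpy as np
    import pint
    from uncertainties import unumpy as unp

    ureg = pint.UnitRegistry()
    ureg.define("CO2 = [co2_emissions]")  # stand-in for ITR's registry
    Q_ = ureg.Quantity

    def add_treating_nan_as_zero(a, b):
        # If b is NaN, keep a unchanged; otherwise add the two quantities.
        return a + (0 if unp.isnan(b.m) else b)

    s1s2 = Q_(10.0, "t CO2")
    print(add_treating_nan_as_zero(s1s2, Q_(np.nan, "t CO2")))  # 10.0 t CO2
    print(add_treating_nan_as_zero(s1s2, Q_(2.5, "t CO2")))     # 12.5 t CO2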


class IHistoricEIScopes(BaseModel):
@@ -812,7 +814,7 @@ def _sector_to_production_units(self, sector, region="Global"):
return units

def _get_base_realization_from_historic(self, realized_values: List[BaseModel], units, base_year=None):
valid_realizations = [rv for rv in realized_values if rv.value is not None and not unp.isnan(rv.value)]
valid_realizations = [rv for rv in realized_values if rv.value is not None and not unp.isnan(rv.value.magnitude)]
if not valid_realizations:
retval = realized_values[0].copy()
retval.year = None
@@ -821,6 +823,7 @@ def _get_base_realization_from_historic(self, realized_values: List[BaseModel],
if base_year and valid_realizations[0].year != base_year:
retval = realized_values[0].copy()
retval.year = base_year
# FIXME: Unless and until we accept uncertainties as input, rather than computed data, we don't need to make this a UFloat here
retval.value = Q_(np.nan, units)
return retval
return valid_realizations[0]
@@ -847,14 +850,16 @@ def __init__(self, emissions_metric=None, production_metric=None, base_year_prod
self.emissions_metric = EmissionsMetric('t CO2')
# TODO: Should raise a warning here
base_year = None
if self.base_year_production:
pass
# Right now historic_data comes in via template.py ESG data
if self.historic_data and self.historic_data.productions:
elif self.historic_data and self.historic_data.productions:
# TODO: This is a hack to get things going.
base_realization = self._get_base_realization_from_historic(self.historic_data.productions, str(self.production_metric), base_year)
base_year = base_realization.year
self.base_year_production = base_realization.value
else:
# raise ValueError(f"missing historic data for base_year_production for {self.company_name}")
raise ValueError(f"missing historic data for base_year_production for {self.company_name}")
self.base_year_production = Q_(np.nan, str(self.production_metric))
if self.ghg_s1s2 is None and self.historic_data and self.historic_data.emissions:
if self.historic_data.emissions.S1S2:
@@ -902,10 +907,10 @@ class ICompanyAggregates(ICompanyData):
benchmark_temperature: quantity('delta_degC')
benchmark_global_budget: EmissionsQuantity

# projected_production is computed but never saved, so computed at least 2x: initialization/projection and cumulative budget
# projected_targets: Optional[ICompanyEIProjectionsScopes]
# projected_intensities: Optional[ICompanyEIProjectionsScopes]


class TemperatureScoreControls(BaseModel):
base_year: int
target_end_year: int
4 changes: 0 additions & 4 deletions ITR/portfolio_aggregation.py
@@ -11,10 +11,6 @@
import pint
import pint_pandas

ureg = pint.get_application_registry()
Q_ = ureg.Quantity
PA_ = pint_pandas.PintArray

from .configs import PortfolioAggregationConfig, ColumnsConfig, logger
from .interfaces import EScope

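Context for the lines deleted above: creating registry aliases at module import time risks binding a different UnitRegistry than the one configured in ITR.data.osc_units, and pint refuses to mix quantities from different registries. A sketch of that failure mode (registry names invented):

    import pint

    ureg_a = pint.UnitRegistry()
    ureg_b = pint.UnitRegistry()
    try:
        # Quantities from two independent registries cannot be combined.
        ureg_a.Quantity(1, "m") + ureg_b.Quantity(1, "m")
    except ValueError as err:
        print(err)  # cannot operate between quantities of different registries

    # Sharing one application registry everywhere avoids the problem.
    pint.set_application_registry(ureg_a)
    ureg = pint.get_application_registry()
    print(ureg.Quantity(2, "m") + ureg_a.Quantity(3, "m"))  # 5 meter
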
20 changes: 13 additions & 7 deletions ITR/temperature_score.py
@@ -63,6 +63,7 @@ def get_score(self, scorable_row: pd.Series) -> Tuple[

# If only target data missing assign only trajectory_score to final score
elif unp.isnan(scorable_row[self.c.COLS.CUMULATIVE_TARGET]) or scorable_row[self.c.COLS.CUMULATIVE_TARGET] == 0:
breakpoint()
target_overshoot_ratio = np.nan
target_temperature_score = np.nan
trajectory_overshoot_ratio = scorable_row[self.c.COLS.CUMULATIVE_TRAJECTORY] / scorable_row[
@@ -155,8 +156,7 @@ def _prepare_data(self, data: pd.DataFrame):
lambda row: self.get_score(row), axis=1))

# Fix up dtypes for the new columns we just added
for c in [self.c.COLS.TEMPERATURE_SCORE, self.c.COLS.TRAJECTORY_SCORE, self.c.COLS.TRAJECTORY_SCORE,
self.c.COLS.TARGET_SCORE]:
for c in [self.c.COLS.TEMPERATURE_SCORE, self.c.COLS.TRAJECTORY_SCORE, self.c.COLS.TARGET_SCORE]:
scoring_data[c] = scoring_data[c].astype('pint[delta_degC]')

scoring_data = self.cap_scores(scoring_data)
@@ -169,11 +169,14 @@ def _calculate_company_score(self, data):
:param data: The original data set as a pandas data frame
:return: The data frame, with an updated s1s2s3 temperature score
"""
# Calculate the GHC
company_data = data[
from ITR.utils import umean

# Calculate the GHC--using umean to deal with uncertainties
# FIXME: what about median vs. mean?
company_data = umean(data[
[self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE, self.c.COLS.GHG_SCOPE12,
self.c.COLS.GHG_SCOPE3, self.c.COLS.TEMPERATURE_SCORE, self.c.SCORE_RESULT_TYPE]
].groupby([self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE]).mean()
].groupby([self.c.COLS.COMPANY_ID, self.c.COLS.TIME_FRAME, self.c.COLS.SCOPE]))

with warnings.catch_warnings():
warnings.simplefilter("ignore")
@@ -230,15 +233,18 @@ def _get_aggregations(self, data: pd.DataFrame, total_companies: int) -> Tuple[A
data[self.c.COLS.CONTRIBUTION] = weighted_scores
with warnings.catch_warnings():
warnings.simplefilter("ignore")
contributions = data \
data_contributions = data[['company_name', 'company_id', 'temperature_score', 'contribution_relative', 'contribution']] \
.sort_values(self.c.COLS.CONTRIBUTION_RELATIVE, ascending=False) \
.where(pd.notnull(data), 0) \
.to_dict(orient="records")
contribution_dicts = [{k:v if isinstance(v, str) else str(v)
for k,v in contribution.items()}
for contribution in data_contributions]
aggregations = (Aggregation(
score=weighted_scores.sum(),
# proportion is not declared by anything to be a percent, so we make it a number from 0..1
proportion=len(weighted_scores) / total_companies,
contributions=[AggregationContribution.parse_obj(contribution) for contribution in contributions]), \
contributions=[AggregationContribution.parse_obj(contribution) for contribution in contribution_dicts]), \
data[self.c.COLS.CONTRIBUTION_RELATIVE], \
data[self.c.COLS.CONTRIBUTION])

25 changes: 25 additions & 0 deletions ITR/utils.py
@@ -1,10 +1,13 @@
from __future__ import annotations

import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Optional, Tuple
from ITR.data.osc_units import ureg
import pint
from uncertainties import unumpy as unp
from uncertainties import ufloat, UFloat

from .interfaces import PortfolioCompany, EScope, ETimeFrames, ScoreAggregations, TemperatureScoreControls
from .configs import ColumnsConfig, TemperatureScoreConfig, LoggingConfig, logger
@@ -127,3 +130,25 @@ def calculate(portfolio_data: pd.DataFrame, fallback_score: pint.Quantity['delta
scores = ts.anonymize_data_dump(scores)

return scores, aggregations


# https://stackoverflow.com/a/74137209/1291237
def umean(quantified_data):
"""
Assuming Gaussian statistics, uncertainties stem from Gaussian parent distributions. In such a case,
it is standard to weight the measurements (nominal values) by the inverse variance.
This function uses error propagation to compute the uncertainty of the weighted average.
:param quantified_data: A sequence of Quantity values whose magnitudes may carry uncertainties
:return: The weighted mean of the values, with a freshly calculated error term
"""
values = np.array(list(map(lambda v: v.m if isinstance(v.m, UFloat) else ufloat(v.m, 0), quantified_data)))
epsilon = unp.nominal_values(values).mean()/(2e13)
wavg = ufloat(sum([v.n/(v.s**2+epsilon) for v in values])/sum([1/(v.s**2+epsilon) for v in values]),
np.sqrt(len(values)/sum([1/(v.s**2+epsilon) for v in values])))
if wavg.s <= np.sqrt(2*epsilon):
if wavg.s > epsilon:
logger.debug(f"Casting out small uncertainty {wavg.s} from {wavg}; epsilon = {epsilon}.")
wavg = wavg.n

return wavg
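
A quick usage sketch of umean (values invented; note that umean returns a bare magnitude, so callers such as _estimated_value re-wrap the result with Q_). Inverse-variance weighting means the tighter measurement dominates:

    from uncertainties import ufloat

    from ITR.data.osc_units import ureg  # ITR's registry knows CO2 units
    from ITR.utils import umean

    Q_ = ureg.Quantity

    # Two measurements of the same emissions intensity; the first has the
    # smaller uncertainty, so it dominates the weighted average.
    values = [Q_(ufloat(0.50, 0.01), "t CO2/MWh"),
              Q_(ufloat(0.60, 0.10), "t CO2/MWh")]
    print(umean(values))  # approximately 0.501+/-0.014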