iza-institute-of-labor-economics · MImmesberger · Mar 23, 2025 · Dec 19, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/.gitignore b/.gitignore
@@ -72,3 +72,4 @@ docs/_build
 docs/tutorials/sandbox_cz.ipynb
 _version.py
 .pixi/
+prof/
diff --git a/docs/conf.py b/docs/conf.py
@@ -5,16 +5,16 @@
 # If extensions (or modules to document with autodoc) are in another directory, add
 # these directories to sys.path here. If the directory is relative to the documentation
 # root, use os.path.abspath to make it absolute, like shown here.
-import datetime as dt
 import pathlib
 import sys
+from datetime import datetime
 
 sys.path.insert(0, str(pathlib.Path("../src").resolve()))
 
 # -- Project information -----------------------------------------------------
 
 project = "GETTSIM"
-copyright = f"2019-{dt.datetime.today().year}, GETTSIM team"  # noqa: A001
+copyright = f"2019-{datetime.today().year}, GETTSIM team"  # noqa: A001
 author = "GETTSIM team"
 release = "0.7.0"
 version = ".".join(release.split(".")[:2])

diff --git a/docs/geps/gep-01.md b/docs/geps/gep-01.md
@@ -42,8 +42,9 @@ a nutshell and without explanations, these conventions are:
      Internal variables should be used sparingly.
 
 1. If names need to be concatenated for making clear what a column name refers to (e.g.,
-   `arbeitsl_geld_2_vermög_freib_bg` vs. `grunds_im_alter_vermög_freib_eg`), the group
-   (i.e., the tax or transfer) that a variable refers to appears first.
+   `arbeitslosengeld_2__freibetrag_vermögen_bg` vs.
+   `grundsicherung__im_alter__vermögensfreibetrag_eg`), the group (i.e., the tax or
+   transfer) that a variable refers to appears first.
 
 1. Because of the necessity of concatenated column names, there will be conflicts
    between readability (1.) and variable length (2.). If such conflicts arise, they need
@@ -108,12 +109,13 @@ no restriction on the number of characters. Internal columns should be used spar
 
 Across variations that include the same identifier, this identifier should not be
 changed, even if it leads to long variable names (e.g., `kinderfreib`,
-`_zu_verst_eink_ohne_kinderfreib_y_sn`). This makes searching for identifiers easier and
+`einkommensteuer__gesamteinkommen_y`). This makes searching for identifiers easier and
 less error-prone.
 
 If names need to be concatenated for making clear what a column name refers to (e.g.,
-`arbeitsl_geld_2_vermög_freib_bg` vs. `grunds_im_alter_vermög_freib_eg`), the group
-(i.e., the tax or transfer) that a variable refers to appears first.
+`arbeitslosengeld_2__freibetrag_vermögen_bg` vs.
+`grundsicherung__im_alter__vermögensfreibetrag_eg`), the group (i.e., the tax or
+transfer) that a variable refers to appears first.
 
 If a column has a reference to a time unit (i.e., any flow variable like earnings or
 transfers), a column is indicated by an underscore plus one of {`y`, `m`, `w`, `d`}.
@@ -127,48 +129,49 @@ GETTSIM knows about the following units:
 - `p_id`: person identifier
 - `hh_id`: Haushalt, individuals living together in a household in the Wohngeld sense
   (§5 WoGG).
-- `wthh_id`: Wohngeldrechtlicher Teilhaushalt, i.e. members of a household for whom the
-  priority check for Wohngeld/ALG2 yields the same result ∈ {True, False}. This unit is
-  based on the priority check via `wohngeld_vorrang_bg` and
-  `wohngeld_kinderzuschl_vorrang_bg`.
-- `fg_id`: Familiengemeinschaft. Maximum of two generations, the relevant unit for
-  Bürgergeld / Arbeitslosengeld 2. Another way to think about this is the potential
-  Bedarfsgemeinschaft before making checks for whether children have enough income fend
-  for themselves. Subset of `hh`.
-- `bg_id`: Bedarfsgemeinschaft, i.e., Familiengemeinschaft excluding children who have
-  enough income to fend for themselves (they will form separate `bg`s). Subset of
-  `fg_id`.
-- `eg_id`: Einstandsgemeinschaft, a couple whose members are deemed to be responsible
-  for each other. This includes couples that live together and may or may not be married
-  or in a civil union.
-- `ehe_id`: Ehegemeinschaft, i.e. couples that are married or in a civil union.
-- `sn_id`: Steuernummer (same for spouses filing taxes jointly, not the same as the
-  Germany-wide Steuer-ID)
+- `wohngeld__wthh_id`: Wohngeldrechtlicher Teilhaushalt, i.e. members of a household for
+  whom the priority check for Wohngeld/ALG2 yields the same result ∈ {True, False}. This
+  unit is based on the priority check via
+  `vorrangprüfungen__wohngeld_vorrang_vor_arbeitslosengeld_2_bg` and
+  `vorrangprüfungen__wohngeld_und_kinderzuschlag_vorrang_vor_arbeitslosengeld_2_bg`.
+- `arbeitslosengeld_2__fg_id`: Familiengemeinschaft. Maximum of two generations, the
+  relevant unit for Bürgergeld / Arbeitslosengeld 2. Another way to think about this is
+  the potential Bedarfsgemeinschaft before making checks for whether children have
+  enough income to fend for themselves. Subset of `hh`.
+- `arbeitslosengeld_2__bg_id`: Bedarfsgemeinschaft, i.e., Familiengemeinschaft excluding
+  children who have enough income to fend for themselves (they will form separate
+  `bg`s). Subset of `arbeitslosengeld_2__fg_id`.
+- `arbeitslosengeld_2__eg_id`: Einstandsgemeinschaft, a couple whose members are deemed
+  to be responsible for each other. This includes couples that live together and may or
+  may not be married or in a civil union.
+- `familie__ehe_id`: Ehegemeinschaft, i.e. couples that are married or in a civil union.
+- `einkommensteuer__sn_id`: Steuernummer (same for spouses filing taxes jointly, not the
+  same as the Germany-wide Steuer-ID)
 
 Note that households do not include flat shares etc. Such broader definitions are
 currently not relevant in GETTSIM but may be added in the future (e.g., capping rules
 for costs of dwelling in SGB II depend on this).
 
 Open questions:
 
-- Can we use bg_id for both SGB II and SGB XII at the same time or do we need to
-  differentiate once we add serious support for SGB XII?
+- Can we use `arbeitslosengeld_2__bg_id` for both SGB II and SGB XII at the same time or
+  do we need to differentiate once we add serious support for SGB XII?
 
 Time unit identifiers always appear before unit identifiers (e.g.,
-`arbeitsl_geld_2_m_bg`).
+`arbeitslosengeld_2__betrag_m_bg`).
 
 ## Parameters of the taxes and transfers system
 
 The structure of these parameters is laid out in \<GEP-3 `gep-3`>; we just note some
 general naming considerations here.
 
 - There is a hierarchical structure to these parameters in that each of them is
-  associated with a group (e.g., `arbeitsl_geld`, `kinderzuschlag`). These groups or
+  associated with a group (e.g., `arbeitslosengeld`, `kinderzuschlag`). These groups or
   abbreviations thereof do not re-appear in the name of the parameter.
 - Parameter names should generally be aligned with relevant column names. However,
   since the group is not repeated for the parameter, it is often better not to
   abbreviate them (e.g., `wohngeld_params["vermögensgrundfreibetrag"]` for the parameter
-  and `wohngeld_anspruchshöhe_m_wthh` for a column derived from it).
+  and `wohngeld__anspruchshöhe_m_wthh` for a column derived from it).
 
 ## Other Python identifiers (Functions, Variables)
 

diff --git a/docs/geps/gep-03.md b/docs/geps/gep-03.md
@@ -333,15 +333,15 @@ The following goes through the details using an example from the basic pension a
 (Grundrente).
 
 The law on the public pension insurance specifies that the maximum possible
-Grundrentenzuschlag `grundr_zuschlag_höchstwert_m` be rounded to the nearest fourth
-decimal point (§76g SGB VI: Zuschlag an Entgeltpunkten für langjährige Versicherung).
-The example below contains GETTSIM's encoding of this fact.
+Grundrentenzuschlag `sozialversicherung__rente__grundrente__höchstbetrag_m` be rounded
+to the nearest fourth decimal point (§76g SGB VI: Zuschlag an Entgeltpunkten für
+langjährige Versicherung). The example below contains GETTSIM's encoding of this fact.
 
 The snippet is taken from `ges_rente.yaml`, which contains the following code:
 
 ```yaml
 rounding:
-  grundr_zuschlag_höchstwert_m:
+  höchstbetrag_m:
     2020-01-01:
       base: 0.0001
       direction: nearest
@@ -393,7 +393,7 @@ necessary inside the functions. The important changes include:
 - Parameters for piecewise polynomials are parsed.
 - Parameters that are derived from other parameters are calculated (examples include
   `kinderzuschlag_max` starting in 2021 or calculating the phasing in of
-  `vorsorgeaufw_alter` over the 2005-2025 period).
+  `vorsorgeaufwendungen_alter` over the 2005-2025 period).
 
 These functions will be available to users en bloc or one-by-one so they can specify
 parameters as in the YAML file for their own policy parameters.

diff --git a/docs/geps/gep-04.md b/docs/geps/gep-04.md
@@ -62,8 +62,10 @@ GETTSIM; this is irrelevant for the DAG.
 
 Function arguments can be of three kinds:
 
-- User-provided input variables (e.g., `bruttolohn_m`).
-- Outputs of other functions in the taxes and transfers system (e.g., `eink_st_y_sn`).
+- User-provided input variables (e.g.,
+  `einkommensteuer__einkünfte__aus_nichtselbstständiger_arbeit__bruttolohn_m`).
+- Outputs of other functions in the taxes and transfers system (e.g.,
+  `einkommensteuer__betrag_y_sn`).
 - Parameters of the taxes and transfers system, which are pre-defined and always end in
   `_params` (e.g., `ges_rentenv_params`).
 
@@ -77,47 +79,59 @@ why we use functions when programming: readability, simplicity, lower maintenanc
 potential entry point for a researcher to change the taxes and transfers system if she
 is able to replace this function with her own version.
 
-See the following example for capital income taxes.
+See the following example for capital income taxes (Abgeltungssteuer).
 
 ```python
-def abgelt_st_y_sn(zu_verst_kapitaleink_y_sn: float, abgelt_st_params: dict) -> float:
+def einkommensteuer__abgeltungssteuer__betrag_y_sn(
+    einkommensteuer__einkünfte__aus_kapitalvermögen__betrag_y_sn: float,
+    abgelt_st_params: dict,
+) -> float:
     """Calculate Abgeltungssteuer on Steuernummer-level.
 
     Parameters
     ----------
-    zu_verst_kapitaleink_y_sn
-        See :func:`zu_verst_kapitaleink_y_sn`.
+    einkommensteuer__einkünfte__aus_kapitalvermögen__betrag_y_sn
+        See :func:`einkommensteuer__einkünfte__aus_kapitalvermögen__betrag_y_sn`.
     abgelt_st_params
         See params documentation :ref:`abgelt_st_params <abgelt_st_params>`.
 
     Returns
     -------
 
     """
-    return abgelt_st_params["satz"] * zu_verst_kapitaleink_y_sn
+    return (
+        abgelt_st_params["satz"]
+        * einkommensteuer__einkünfte__aus_kapitalvermögen__betrag_y_sn
+    )
 ```
 
-The function {func}`abgelt_st_y_sn` requires the variable `zu_verst_kapital_eink_y_sn`,
-which is the amount of taxable capital income on the Steuernummer-level (the latter is
-implied by the `_sn` suffix, see {ref}`gep-1`). `zu_verst_kapital_eink_y_sn` must be
-provided by the user as a column of the input data or it has to be the name of another
-function. It is also possible to specify `zu_verst_kapital_eink_y` and aggregation to
-the `sn`-level will happen automatically. `abgelt_st_params` is a dictionary of
-parameters related to the calculation of `abgelt_st_y_sn`.
+The function `einkommensteuer__abgeltungssteuer__betrag_y_sn` requires the variable
+`einkommensteuer__einkünfte__aus_kapitalvermögen__betrag_y_sn`, which is the amount of
+taxable capital income on the Steuernummer-level (the latter is implied by the `_sn`
+suffix, see {ref}`gep-1`).
+`einkommensteuer__einkünfte__aus_kapitalvermögen__betrag_y_sn` must be provided by the
+user as a column of the input data or it has to be the name of another function.
+`abgelt_st_params` is a dictionary of parameters related to the calculation of
+`betrag_y_sn`.
+
+> Note: In the source code, the prefix `einkommensteuer__abgeltungssteuer__` is missing.
+> This is because it is inferred from the path the function is defined in. For more
+> details, see {ref}`gep-6`.
 
 Another function, say
 
 ```python
-def soli_st_y_sn(
-    eink_st_mit_kinderfreib_y_sn: float,
-    anz_personen_sn: int,
-    abgelt_st_y_sn: float,
+def solidaritätszuschlag__betrag_y_sn(
+    einkommensteuer__betrag_mit_kinderfreibetrag_y_sn: float,
+    einkommensteuer__anzahl_personen_sn: int,
+    einkommensteuer__abgeltungssteuer__betrag_y_sn: float,
     soli_st_params: dict,
 ) -> float: ...
 ```
 
-may use `abgelt_st_y_sn` as an input argument. The DAG backend ensures that the function
-`abgelt_st_y_sn` will be executed first.
+may use `einkommensteuer__abgeltungssteuer__betrag_y_sn` as an input argument. The DAG
+backend ensures that the function `einkommensteuer__abgeltungssteuer__betrag_y_sn` will
+be executed first.
 
 Note that the type annotations (e.g. `float`) indicate the expected type of each input
 and the output of a function, see {ref}`gep-2`.
@@ -216,21 +230,26 @@ data. This section describes how to specify them.
 
 In order to inject aggregation functions at the group level into the graph, scripts with
 functions of the taxes and transfer system should define a dictionary
-`aggregate_by_group_[script_name]` at the module level. This dictionary must specify the
-aggregated columns as keys and a dictionary with keys `source_col` and `aggr` as values.
-If `aggr` is `count`, `source_col` is not needed.
+`aggregation_specs` at the module level. This dictionary must specify the aggregated
+columns as keys and the `AggregateByGroupSpec` data class as values. The data class
+specifies the `source` (i.e. the column which is being aggregated) and the aggregation
+method `aggr`.
 
-For example, in `demographic_vars.py`, we could have:
+For example, in `household_characteristics.py`, we could have:
 
 ```
-aggregate_by_group_demographic_vars = {
-    "anz_kinder_hh": {"source_col": "kind", "aggr": "sum"},
-    "anz_personen_hh": {"aggr": "count"},
+from _gettsim.aggregation import AggregateByGroupSpec
+
+aggregation_specs = {
+    "anzahl_kinder_hh": AggregateByGroupSpec(source="familie__kind", aggr="sum"),
+    "anzahl_personen_hh": AggregateByGroupSpec(aggr="count"),
 }
 ```
 
-The group identifier (`hh_id`, `wthh_id`, `fg_id`, `bg_id`, `eg_id`, `ehe_id`, `sn_id`)
-will be automatically included as an argument; for `count` nothing else is necessary.
+The group identifier (`hh_id`, `wohngeld__wthh_id`, `arbeitslosengeld_2__fg_id`,
+`arbeitslosengeld_2__bg_id`, `arbeitslosengeld_2__eg_id`, `familie__ehe_id`,
+`einkommensteuer__sn_id`) will be automatically included as an argument; for `count`
+nothing else is necessary.
 
 The output type will be the same as the input type. Exceptions:
 
@@ -249,29 +268,33 @@ Automatic summation will only happen in case no column `my_col_hh` is explicitly
 Using a different reduction function than the sum is as easy as explicitly specifying
 `my_col_hh`.
 
-Consider the following example: the function `kindergeld_m` calculates the
-individual-level child benefit payment. `arbeitsl_geld_2_m_bg` calculates
+Consider the following example: the function `kindergeld__betrag_m` calculates the
+individual-level child benefit payment. `arbeitslosengeld_2__betrag_m_bg` calculates
 Arbeitslosengeld 2 on the Bedarfsgemeinschaft (bg) level (as indicated by the suffix).
 One necessary input of this function is the sum of all child benefits on the
-Bedarfsgemeinschaft level. There is no function or input column `kindergeld_m_bg`.
+Bedarfsgemeinschaft level. There is no function or input column
+`kindergeld__betrag_m_bg`.
 
-By including `kindergeld_m_bg` as an argument in the definition of
-`arbeitsl_geld_2_m_bg` as follows:
+By including `kindergeld__betrag_m_bg` as an argument in the definition of
+`arbeitslosengeld_2__betrag_m_bg` as follows:
 
 ```python
-def arbeitsl_geld_2_m_bg(kindergeld_m_bg, other_arguments): ...
+def arbeitslosengeld_2__betrag_m_bg(kindergeld__betrag_m_bg, other_arguments): ...
 ```
 
-a node `kindergeld_m_bg` containing the Bedarfsgemeinschaft-level sum of `kindergeld_m`
-will be automatically added to the graph. Its parents in the graph will be
-`kindergeld_m` and `bg_id`. This is the same as specifying:
+a node `kindergeld__betrag_m_bg` containing the Bedarfsgemeinschaft-level sum of
+`kindergeld__betrag_m` will be automatically added to the graph. Its parents in the
+graph will be `kindergeld__betrag_m` and `arbeitslosengeld_2__bg_id`. This is the same
+as specifying:
 
 ```
-aggregate_by_group_kindergeld =  = {
-    "kindergeld_m_bg": {
-        "source_col": "kindergeld_m",
-        "aggr": "sum"
-    }
+from _gettsim.aggregation import AggregateByGroupSpec
+
+aggregation_specs = {
+    "kindergeld__betrag_m_bg": AggregateByGroupSpec(
+        source="kindergeld__betrag_m",
+        aggr="sum"
+    )
 }
 ```
 
@@ -287,31 +310,31 @@ column. This section describes how to specify such taxes and transfers.
 
 The implementation is similar to aggregations to the level of groupings: In order to
 specify new aggregation functions, scripts with functions of the taxes and transfer
-system should define a dictionary `aggregate_by_p_id_[script_name]` at the module level.
-This dictionary must specify the aggregated columns as keys and a dictionary with keys
-`source_col`, `p_id_to_aggregate_by` and `aggr` as values. If `aggr` is `count`,
-`source_col` is not needed.
+system should define a dictionary `aggregation_specs` at the module level. This
+dictionary must specify the aggregated columns as keys and the `AggregateByPIDSpec` data
+class as values. The class specifies the `source`, `p_id_to_aggregate_by`, and `aggr`.
+If `aggr` is `count`, `source` is not needed.
 
-The key `source_col` specifies which column is the source of the aggregation operation.
-The key `p_id_to_aggregate_by` specifies the column that indicates to which `p_id` the
-values in `source_col` should be ascribed to. The key `aggr` gives the aggregation
-method.
+The field `source` specifies which column is the source of the aggregation operation.
+The field `p_id_to_aggregate_by` specifies the column that indicates to which `p_id`
+the values in `source` should be ascribed. The field `aggr` gives the aggregation method.
 
 For example, in `kindergeld.py`, we could have:
 
 ```
-aggregate_by_p_id_kindergeld = {
-    "kindergeld_anz_ansprüche": {
-        "p_id_to_aggregate_by": "p_id_kindergeld_empf",
-        "source_col": "kindergeld_anspruch",
-        "aggr": "sum",
-    },
+aggregation_specs = {
+    "kindergeld__anzahl_ansprüche": AggregateByPIDSpec(
+        p_id_to_aggregate_by="kindergeld__p_id_empfänger",
+        source="kindergeld__grundsätzlich_anspruchsberechtigt",
+        aggr="sum",
+    ),
 }
 ```
 
-This dict creates a target function `kindergeld_anz_ansprüche` which gives the amount of
-claims that a person has on Kindergeld, based on the `kindergeld_anspruch` function
-which returns Booleans, which show whether a child is a reason for a Kindergeld claim.
+This dict creates a target function `kindergeld__anzahl_ansprüche` which gives the
+number of claims that a person has on Kindergeld, based on the
+`kindergeld__grundsätzlich_anspruchsberechtigt` function, which returns Booleans
+indicating whether a child is a reason for a Kindergeld claim.
 
 The output type will be the same as the input type. Exceptions:
-Original file line number
+Diff line change
@@ Expand Up / @@ -72,3 +72,4 @@ docs/_build @@
     docs/tutorials/sandbox_cz.ipynb
     _version.py
     .pixi/
+    prof/