canva-public
diff --git a/‎.github/workflows/check_version.yml
+47 b/‎.github/workflows/check_version.yml
+47
diff --git a/‎.github/workflows/publish.yml
+37 b/‎.github/workflows/publish.yml
+37
diff --git a/‎.github/workflows/test.yml
+52 b/‎.github/workflows/test.yml
+52
diff --git a/‎.gitignore
+2 b/‎.gitignore
+2
diff --git a/‎docs/model_selection_syntax.md
+59 b/‎docs/model_selection_syntax.md
+59
diff --git a/‎examples/main_step_1_direct.py
+5 b/‎examples/main_step_1_direct.py
+5
diff --git a/‎examples/readme.md
+38-1 b/‎examples/readme.md
+38-1
diff --git a/‎py_package/dbt_column_lineage_extractor/cli_direct.py
+76-28 b/‎py_package/dbt_column_lineage_extractor/cli_direct.py
+76-28
@@ -0,0 +1,47 @@
+name: Check Version Update
+
+on:
+  pull_request:
+    branches:
+      - main
+    # Match the publish workflow's trigger: any package change must bump the version.
+    paths:
+      - 'py_package/**'
+
+jobs:
+  check-version:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout PR branch
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
+    - name: Extract PR version
+      id: pr_version
+      run: |
+        PR_VERSION=$(grep -oP "version='\K[^']+" py_package/setup.py)
+        echo "pr_version=$PR_VERSION" >> "$GITHUB_OUTPUT"
+        echo "PR Version: $PR_VERSION"
+
+    - name: Extract main version
+      id: main_version
+      run: |
+        git fetch origin main
+        MAIN_VERSION=$(git show origin/main:py_package/setup.py | grep -oP "version='\K[^']+")
+        echo "main_version=$MAIN_VERSION" >> "$GITHUB_OUTPUT"
+        echo "Main Version: $MAIN_VERSION"
+
+    - name: Compare versions
+      id: compare_versions
+      # Pass the versions through env so PR-controlled text is never spliced
+      # into the shell script by expression expansion.
+      env:
+        PR_VERSION: ${{ steps.pr_version.outputs.pr_version }}
+        MAIN_VERSION: ${{ steps.main_version.outputs.main_version }}
+      run: |
+        if [ "$PR_VERSION" = "$MAIN_VERSION" ]; then
+          echo "::error::Version in setup.py must be updated from '$MAIN_VERSION' when making a PR to main branch"
+          exit 1
+        fi
+        echo "Version was updated from '$MAIN_VERSION' to '$PR_VERSION'"
@@ -0,0 +1,37 @@
+name: Publish Python Package
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'py_package/**'
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: ./py_package  # every run step below executes from the package directory
+
+    steps:
+    - uses: actions/checkout@v4  # same major version as the other workflows in this repository
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.x'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine
+
+    - name: Build package
+      run: python setup.py sdist bdist_wheel
+
+    - name: Publish to PyPI
+      env:
+        TWINE_USERNAME: __token__  # fixed username used together with an API token
+        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+      run: twine upload dist/*
@@ -0,0 +1,52 @@
+name: Python Tests
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8"]
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install build dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install build setuptools wheel
+
+    - name: Install package dependencies
+      run: |
+        pip install pytest
+        pip install -r py_package/requirements.txt
+
+    # Each step starts in the workspace root, so the package directory is set
+    # per step instead of cd-ing in and back out.
+    - name: Install package in editable mode
+      working-directory: py_package
+      run: pip install -e .
+
+    - name: Setup test data
+      run: |
+        mkdir -p tests/test_data/inputs
+        cp -r examples/inputs/* tests/test_data/inputs/
+
+    - name: Verify test data
+      run: |
+        ls -la tests/test_data/inputs/
+        echo "Test data files:"
+        find tests/test_data/inputs/ -type f -exec ls -lh {} \;
+
+    - name: Run tests with pytest
+      run: |
+        python -m pytest -v tests/ --capture=no
@@ -155,3 +155,5 @@ cython_debug/
 .DS_Store
 lineage_to_direct_children.json
 lineage_to_direct_parents.json
+/MagicMock
+/outputs
@@ -0,0 +1,59 @@
+# Model Selection Syntax
+
+This document explains the dbt-style model selection syntax supported by the DBT Column Lineage Extractor tool.
+
+## Node Types
+- Regular models: `model_name` or `model.package.model_name`
+- Source nodes: `source.schema.table`
+
+## Graph Operators
+- `+model_name` - Include the model and all its ancestors (upstream models)
+- `model_name+` - Include the model and all its descendants (downstream models)
+- `+model_name+` - Include the model, all its ancestors, and all its descendants (entire lineage)
+
+## Set Operators
+- `model1 model2` - Select models that match either selector (union)
+- `model1,model2` - Select models that match both selectors (intersection)
+
+## Resource Selectors
+- `tag:daily` - Select models with the tag "daily"
+- `path:models/finance` - Select models in the specified path
+- `package:marketing` - Select models from the specified package
+
+## Examples
+
+```bash
+# Select all models tagged "finance" and their upstream dependencies
+dbt_column_lineage_direct --model "+tag:finance"
+
+# Select the orders model and everything downstream of it
+dbt_column_lineage_direct --model "orders+"
+
+# Select models that are both tagged "daily" and in the finance package
+dbt_column_lineage_direct --model "tag:daily,package:finance"
+
+# Select models in the core package or downstream from customers
+dbt_column_lineage_direct --model "package:core customers+"
+
+# Select a specific source
+dbt_column_lineage_direct --model "source.raw.customers"
+
+# Select a source and everything downstream of it
+dbt_column_lineage_direct --model "source.raw.customers+"
+
+# Select a source and its entire lineage (both upstream and downstream)
+dbt_column_lineage_direct --model "+source.raw.customers+"
+```
+
+You can also specify models from a JSON file using the `--model-list-json` parameter:
+```bash
+dbt_column_lineage_direct --manifest ./inputs/manifest.json --catalog ./inputs/catalog.json --model-list-json ./models.json
+```
+Here, `models.json` is a JSON file containing a list of model and source names:
+```json
+[
+  "model.jaffle_shop.customers",
+  "model.jaffle_shop.orders",
+  "source.raw.customers"
+]
+``` 
@@ -13,6 +13,11 @@
     # "model.dbt_project_1.model_2"
 ]
 
+# Alternative: You can also load models from a JSON file
+# import json
+# with open('models.json', 'r') as f:
+#     li_selected_model = json.load(f)
+
 extractor = DbtColumnLineageExtractor(
     manifest_path=manifest_path,
     catalog_path=catalog_path,
 
@@ -3,7 +3,10 @@
 1. Place your dbt `manifest.json` and `catalog.json` files in the `inputs` directory.
 2. **Customization**:
    - Set your dialect (only tested with `snowflake` so far) in the `main_step_1_direct.py` script.
-   - You can specify the scope of the models you want to extract column lineage for by adding them to the `li_selected_model` list, or leave it empty to process all models (recommended).
+   - You can specify the scope of the models you want to extract column lineage for by adding them to the `li_selected_model` list, or leave it empty to process all models.
+   - When specifying models, you can use dbt-style selectors like `+model_name` (ancestors), `model_name+` (descendants), `+model_name+` (entire lineage), `tag:my_tag` (tag filtering), etc.
+   - Both models and sources are supported in selectors (e.g., `source.schema.table`).
+   - Alternatively, you can create a JSON file with a list of models and use the `--model-list-json` parameter when running the CLI.
 
 3. Run the `main_step_1_direct.py` script to extract direct column lineage:
    ```bash
@@ -14,6 +17,40 @@
    - `lineage_to_direct_parents.json`
    - `lineage_to_direct_children.json`
 
+#### Model Selection with dbt-style Syntax
+
+When specifying models using Python code, you can use dbt-style selectors just like in the CLI:
+
+```python
+# Example model selectors
+li_selected_model = [
+    # Include orders and all its ancestors
+    "+orders",
+    
+    # Include all models with "finance" tag
+    "tag:finance",
+    
+    # Include models that are both daily-tagged AND in the core package
+    "tag:daily,package:core",
+    
+    # Include a specific source
+    "source.raw.customers",
+    
+    # Include a source and everything downstream of it
+    "source.raw.orders+",
+    
+    # Get the entire lineage (upstream and downstream) of a source
+    "+source.raw.payments+"
+]
+
+extractor = DbtColumnLineageExtractor(
+    manifest_path="./inputs/manifest.json",
+    catalog_path="./inputs/catalog.json",
+    selected_models=li_selected_model,
+    dialect="snowflake"
+)
+```
+
 #### Analyze Recursive Column Lineage
 
 1. With the output from the direct column lineage step, run the `main_step_2_recursive.py` script to analyze recursive column lineage:
 
@@ -7,44 +7,92 @@ def main():
     parser.add_argument('--manifest', default='./inputs/manifest.json', help='Path to the manifest.json file, default to ./inputs/manifest.json')
     parser.add_argument('--catalog', default='./inputs/catalog.json', help='Path to the catalog.json file, default to ./inputs/catalog.json')
     parser.add_argument('--dialect', default='snowflake', help='SQL dialect to use, default is snowflake, more dialects at https://github.com/tobymao/sqlglot/tree/v25.24.5/sqlglot/dialects')
-    parser.add_argument('--model', nargs='*', default=[], help='List of models to extract lineage for, default to all models')
+    parser.add_argument(
+        '--model', 
+        nargs='*', 
+        default=[], 
+        help='''List of models to extract lineage for using dbt-style selectors:
+            - Simple model names: model_name
+            - Include ancestors: +model_name (include upstream/parent models)
+            - Include descendants: model_name+ (include downstream/child models)
+            - Union (either): "model1 model2" (models matching either selector)
+            - Intersection (both): "model1,model2" (models matching both selectors)
+            - Tag filtering: tag:my_tag (models with specific tag)
+            - Path filtering: path:models/finance (models in specific path)
+            - Package filtering: package:my_package (models in specific package)
+            Default behavior extracts lineage for all models.'''
+    )
+    parser.add_argument('--model-list-json', help='Path to a JSON file containing a list of models to extract lineage for. If specified, this takes precedence over --model')
     parser.add_argument('--output-dir', default='./outputs', help='Directory to write output json files, default to ./outputs')
     parser.add_argument('--show-ui', action='store_true', help='Flag to show lineage outputs in the console')
+    parser.add_argument('--continue-on-error', action='store_true', help='Continue processing even if some models fail')
 
     args = parser.parse_args()
 
-    # utils.clear_screen()
-
-    extractor = DbtColumnLineageExtractor(
-        manifest_path=args.manifest,
-        catalog_path=args.catalog,
-        selected_models=args.model,
-        dialect=args.dialect,
-    )
+    try:
+        selected_models = args.model
+        if args.model_list_json:
+            try:
+                selected_models = utils.read_json(args.model_list_json)
+                if not isinstance(selected_models, list):
+                    raise ValueError("The JSON file must contain a list of model names")
+            except Exception as e:
+                print(f"Error reading model list from JSON file: {e}")
+                return 1
 
-    lineage_map = extractor.build_lineage_map()
-    lineage_to_direct_parents = extractor.get_columns_lineage_from_sqlglot_lineage_map(lineage_map)
-    lineage_to_direct_children = (
-        extractor.get_lineage_to_direct_children_from_lineage_to_direct_parents(
-            lineage_to_direct_parents
+        extractor = DbtColumnLineageExtractor(
+            manifest_path=args.manifest,
+            catalog_path=args.catalog,
+            selected_models=selected_models,
+            dialect=args.dialect,
         )
-    )
 
-    utils.write_dict_to_file(
-        lineage_to_direct_parents, f"{args.output_dir}/lineage_to_direct_parents.json"
-    )
+        print(f"Processing {len(extractor.selected_models)} models after selector expansion")
+        
+        try:
+            lineage_map = extractor.build_lineage_map()
+            
+            if not lineage_map:
+                print("Warning: No valid lineage was generated. Check for errors above.")
+                if not args.continue_on_error:
+                    return 1
+                    
+            lineage_to_direct_parents = extractor.get_columns_lineage_from_sqlglot_lineage_map(lineage_map)
+            lineage_to_direct_children = (
+                extractor.get_lineage_to_direct_children_from_lineage_to_direct_parents(
+                    lineage_to_direct_parents
+                )
+            )
 
-    utils.write_dict_to_file(
-        lineage_to_direct_children, f"{args.output_dir}/lineage_to_direct_children.json"
-    )
+            utils.write_dict_to_file(
+                lineage_to_direct_parents, f"{args.output_dir}/lineage_to_direct_parents.json"
+            )
+
+            utils.write_dict_to_file(
+                lineage_to_direct_children, f"{args.output_dir}/lineage_to_direct_children.json"
+            )
 
-    if args.show_ui:
-        print("===== Lineage to Direct Parents =====")
-        utils.pretty_print_dict(lineage_to_direct_parents)
-        print("===== Lineage to Direct Children =====")
-        utils.pretty_print_dict(lineage_to_direct_children)
+            if args.show_ui:
+                print("===== Lineage to Direct Parents =====")
+                utils.pretty_print_dict(lineage_to_direct_parents)
+                print("===== Lineage to Direct Children =====")
+                utils.pretty_print_dict(lineage_to_direct_children)
 
-    print("Lineage extraction complete. Output files written to output directory.")
+            print("Lineage extraction complete. Output files written to output directory.")
+            return 0
+            
+        except Exception as e:
+            print(f"Error during lineage extraction: {str(e)}")
+            if not args.continue_on_error:
+                raise
+            return 1
+            
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return 1
 
 if __name__ == '__main__':
-    main()
+    import sys
+    sys.exit(main())