canva-public
diff --git a/‎.gitignore
Lines changed: 2 additions & 0 deletions b/‎.gitignore
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/main_step_2_recursive.py
Lines changed: 32 additions & 6 deletions b/‎examples/main_step_2_recursive.py
Lines changed: 32 additions & 6 deletions
diff --git a/‎images/mermaid_example.png
69.5 KB b/‎images/mermaid_example.png
69.5 KB
diff --git a/‎py_package/dbt_column_lineage_extractor/cli_recursive.py
Lines changed: 79 additions & 88 deletions b/‎py_package/dbt_column_lineage_extractor/cli_recursive.py
Lines changed: 79 additions & 88 deletions
diff --git a/‎py_package/dbt_column_lineage_extractor/utils.py
Lines changed: 5 additions & 0 deletions b/‎py_package/dbt_column_lineage_extractor/utils.py
Lines changed: 5 additions & 0 deletions
@@ -157,3 +157,5 @@ lineage_to_direct_children.json
 lineage_to_direct_parents.json
 /MagicMock
 /outputs
+examples/outputs/*.html
+examples/outputs/*.mmd
@@ -1,5 +1,8 @@
 import dbt_column_lineage_extractor.utils as utils
 from dbt_column_lineage_extractor import DbtColumnLineageExtractor
+from dbt_column_lineage_extractor.visualization import create_html_viewer, convert_to_mermaid
+import os
+import webbrowser
 
 utils.clear_screen()
 
@@ -12,13 +15,13 @@
     "./outputs/lineage_to_direct_children.json"
 )
 
-# a source node example
-model = "seed.jaffle_shop.raw_orders"
-column = "id"
+# # a source node example
+# model = "seed.jaffle_shop.raw_orders"
+# column = "id"
 
-# # an intermediate node example
-# model = "model.jaffle_shop.stg_orders"
-# column = "order_id"
+# an intermediate node example
+model = "model.jaffle_shop.stg_orders"
+column = "order_id"
 
 
 print("========================================")
@@ -59,3 +62,26 @@
 print(
     "Or, you can copy the json outputs to tools like https://github.com/AykutSarac/jsoncrack.com, https://jsoncrack.com/editor to visualize the lineage"
 )
+
+# Create visualizations
+output_dir = "./outputs"
+os.makedirs(output_dir, exist_ok=True)
+
+# Convert to Mermaid format and create visualization
+mermaid_output = convert_to_mermaid(model, column, ancestors_structured, descendants_structured)
+
+# Save Mermaid output
+mermaid_file = os.path.join(output_dir, f"{model}_{column}_lineage.mmd")
+with open(mermaid_file, 'w') as f:
+    f.write(mermaid_output)
+
+# Create HTML viewer
+viewer_file = create_html_viewer(mermaid_output, output_dir, model, column)
+
+print("\nVisualization files created:")
+print(f"- Mermaid file: {mermaid_file}")
+print(f"- HTML viewer: {viewer_file}")
+
+# Open the visualization in browser
+print("\nOpening visualization in browser...")
+webbrowser.open(f"file://{os.path.abspath(viewer_file)}")
@@ -1,17 +1,11 @@
 import argparse
 import os
 import json
+import webbrowser
+from pathlib import Path
 import dbt_column_lineage_extractor.utils as utils
 from dbt_column_lineage_extractor import DbtColumnLineageExtractor
-
-def find_model_in_lineage(lineage_data, model_name):
-    """Find full node paths in lineage data that match the model name."""
-    matching_nodes = []
-    for node in lineage_data.keys():
-        # Check if the node ends with the model name
-        if node.split('.')[-1] == model_name:
-            matching_nodes.append(node)
-    return matching_nodes
+from dbt_column_lineage_extractor.visualization import create_html_viewer, convert_to_mermaid
 
 def main():
     parser = argparse.ArgumentParser(description="Recursive DBT Column Lineage Extractor CLI")
@@ -21,12 +15,11 @@ def main():
                         help='Path to the lineage_to_direct_parents.json file, default to ./outputs/lineage_to_direct_parents.json')
     parser.add_argument('--lineage-children-file', default='./outputs/lineage_to_direct_children.json',
                         help='Path to the lineage_to_direct_children.json file, default to ./outputs/lineage_to_direct_children.json')
-    parser.add_argument('--manifest', default='./target/manifest.json', 
-                        help='Path to the dbt manifest.json file, default to ./target/manifest.json')
-    parser.add_argument('--catalog', default='./target/catalog.json',
-                        help='Path to the dbt catalog.json file, default to ./target/catalog.json')
     parser.add_argument('--output-dir', default='./outputs', help='Output directory for lineage files, default to ./outputs')
-    parser.add_argument('--show-ui', action='store_true', help='Show web UI for lineage visualization')
+    parser.add_argument('--no-ui', action='store_true', help='Do not automatically open the visualization in browser')
+    parser.add_argument('--output-format', choices=['json', 'mermaid', 'both'], default='both',
+                        help='Output format for lineage data. Choose between json, mermaid, or both. Default is both.')
+    parser.add_argument('--show-details', action='store_true', help='Show detailed squashed/structured ancestors/descendants in terminal')
 
     args = parser.parse_args()
 
@@ -40,63 +33,41 @@ def main():
             lineage_to_direct_children = utils.read_dict_from_file(args.lineage_children_file)
         except FileNotFoundError as e:
             print(f"Error: Could not find required lineage file: {e}")
+            print("\nTo generate the required lineage files, please run one of the following commands first:")
+            print("\n1. To scan the whole project (takes longer, but you don't need to run it again for different models if there is no model change):")
+            print("   dbt_column_lineage_direct --manifest path/to/manifest.json --catalog path/to/catalog.json")
+            print("\n2. If only interested in this model (faster):")
+            print(f"   dbt_column_lineage_direct --manifest path/to/manifest.json --catalog path/to/catalog.json --model +{args.model}+")
+            print("\nAfter running one of these commands, try this command again.")
             return 1
         except json.JSONDecodeError as e:
             print(f"Error: Invalid JSON in lineage file: {e}")
             return 1
 
-        # Resolve model name to full node path if needed
+        # Start from the model name as provided; a partial name is resolved against the lineage files below
         model_node = args.model
-        # Check if this is not already a full node path
-        if not model_node.startswith(('model.', 'source.')):
-            # First try to find the model in lineage files
-            matching_nodes_from_lineage = find_model_in_lineage(lineage_to_direct_parents, model_node)
-            
-            if matching_nodes_from_lineage:
-                if len(matching_nodes_from_lineage) > 1:
-                    print(f"Warning: Multiple models match '{model_node}' in lineage files. Using the first match: {matching_nodes_from_lineage[0]}")
-                model_node = matching_nodes_from_lineage[0]
-                print(f"Resolved model name '{args.model}' to full node path from lineage file: {model_node}")
-            else:
-                # Try alternate lineage file if first one didn't have matches
-                matching_nodes_from_lineage = find_model_in_lineage(lineage_to_direct_children, model_node)
-                if matching_nodes_from_lineage:
-                    if len(matching_nodes_from_lineage) > 1:
-                        print(f"Warning: Multiple models match '{model_node}' in lineage files. Using the first match: {matching_nodes_from_lineage[0]}")
-                    model_node = matching_nodes_from_lineage[0]
-                    print(f"Resolved model name '{args.model}' to full node path from lineage file: {model_node}")
-                else:
-                    # If not found in lineage files, try to use manifest if available
-                    try:
-                        # First check if manifest and catalog files exist
-                        if not os.path.exists(args.manifest):
-                            print(f"Warning: Manifest file not found at '{args.manifest}'. Cannot resolve model name.")
-                            print("Proceeding with the original model name. If this is incorrect, please provide a full node path (e.g., model.package.model_name)")
-                        elif not os.path.exists(args.catalog):
-                            print(f"Warning: Catalog file not found at '{args.catalog}'. Cannot resolve model name.")
-                            print("Proceeding with the original model name. If this is incorrect, please provide a full node path (e.g., model.package.model_name)")
-                        else:
-                            extractor = DbtColumnLineageExtractor(
-                                manifest_path=args.manifest,
-                                catalog_path=args.catalog
-                            )
-                            # Try to resolve the model name
-                            matching_nodes = extractor._resolve_node_by_name(model_node)
-                            if matching_nodes:
-                                if len(matching_nodes) > 1:
-                                    print(f"Warning: Multiple models match '{model_node}'. Using the first match: {matching_nodes[0]}")
-                                model_node = matching_nodes[0]
-                                print(f"Resolved model name '{args.model}' to full node path: {model_node}")
-                            else:
-                                print(f"Warning: Could not find any model with name '{model_node}' in the manifest file.")
-                                print("Proceeding with the original model name. If this is incorrect, please provide a full node path (e.g., model.package.model_name)")
-                    except Exception as e:
-                        print(f"Warning: Error when trying to resolve model name: {e}")
-                        print("Proceeding with the original model name. If this is incorrect, please provide a full node path (e.g., model.package.model_name)")
 
         # Check if model exists in lineage files
         if model_node not in lineage_to_direct_parents and model_node not in lineage_to_direct_children:
-            print(f"Warning: Model '{model_node}' not found in lineage files. Results may be empty or incomplete.")
+            # Search for potential matches in both lineage files
+            parent_matches = utils.find_potential_matches(lineage_to_direct_parents, model_node)
+            child_matches = utils.find_potential_matches(lineage_to_direct_children, model_node)
+            
+            # Combine unique matches
+            all_matches = list(set(parent_matches + child_matches))
+            
+            if all_matches:
+                if len(all_matches) > 1:
+                    print("\nError: Multiple potential matches found. Please use one of the following full node paths:")
+                    for match in sorted(all_matches):
+                        print(f"  - {match}")
+                    return 1
+                else:
+                    # Single match found - use it automatically
+                    model_node = all_matches[0]
+                    print(f"Using model: {model_node}")
+            else:
+                print(f"No matches found for '{model_node}'. Results may be empty or incomplete.")
 
         print("========================================")
         # Find all ancestors for a specific model and column
@@ -106,17 +77,14 @@ def main():
             lineage_to_direct_parents, model_node, args.column
         )
 
-        print("---squashed ancestors---")
-        utils.pretty_print_dict(ancestors_squashed)
-        print("---structured ancestors---")
-        utils.pretty_print_dict(ancestors_structured)
-
-        # Save ancestors to files
-        ancestors_file = os.path.join(args.output_dir, f"{model_node}_{args.column}_ancestors.json")
-        utils.write_dict_to_file(ancestors_structured, ancestors_file)
+        if args.show_details:
+            print("---squashed ancestors---")
+            utils.pretty_print_dict(ancestors_squashed)
+            print("---structured ancestors---")
+            utils.pretty_print_dict(ancestors_structured)
 
-        print("========================================")
         # Find all descendants for a specific model and column
+        print("========================================")
         print(f"Finding all descendants of {model_node}.{args.column}:")
         descendants_squashed = DbtColumnLineageExtractor.find_all_related(
             lineage_to_direct_children, model_node, args.column
@@ -125,25 +93,48 @@ def main():
             lineage_to_direct_children, model_node, args.column
         )
 
-        print("---squashed descendants---")
-        utils.pretty_print_dict(descendants_squashed)
-        print("---structured descendants---")
-        utils.pretty_print_dict(descendants_structured)
+        if args.show_details:
+            print("---squashed descendants---")
+            utils.pretty_print_dict(descendants_squashed)
+            print("---structured descendants---")
+            utils.pretty_print_dict(descendants_structured)
 
-        # Save descendants to files
-        descendants_file = os.path.join(args.output_dir, f"{model_node}_{args.column}_descendants.json")
-        utils.write_dict_to_file(descendants_structured, descendants_file)
+        # Save outputs based on format
+        if args.output_format in ['json', 'both']:
+            # Create safe filenames by replacing dots with underscores
+            safe_model_name = model_node.replace('.', '_')
+            
+            # Save ancestors to files
+            ancestors_file = os.path.join(args.output_dir, f"{safe_model_name}_{args.column}_ancestors.json")
+            utils.write_dict_to_file(ancestors_structured, ancestors_file)
+            
+            # Save descendants to files
+            descendants_file = os.path.join(args.output_dir, f"{safe_model_name}_{args.column}_descendants.json")
+            utils.write_dict_to_file(descendants_structured, descendants_file)
+            
+            print("========================================")
+            print(f"Lineage outputs saved to {ancestors_file} and {descendants_file}")
 
-        print("========================================")
-        print(
-            "You can use the structured ancestors and descendants to programmatically use the lineage, "
-            "such as for impact analysis, data tagging, etc."
-        )
-        print(
-            "Or, you can copy the json outputs to tools like https://github.com/AykutSarac/jsoncrack.com, "
-            "https://jsoncrack.com/editor to visualize the lineage"
-        )
-        print(f"Lineage outputs saved to {ancestors_file} and {descendants_file}")
+        if args.output_format in ['mermaid', 'both']:
+            # Convert to Mermaid format
+            mermaid_output = convert_to_mermaid(model_node, args.column, ancestors_structured, descendants_structured)
+            
+            # Save Mermaid output
+            mermaid_file = os.path.join(args.output_dir, f"{model_node}_{args.column}_lineage.mmd")
+            with open(mermaid_file, 'w') as f:
+                f.write(mermaid_output)
+            
+            # Always create HTML viewer for Mermaid output
+            viewer_file = create_html_viewer(mermaid_output, args.output_dir, model_node, args.column)
+            
+            print(f"Mermaid output saved to {mermaid_file}")
+            print(f"HTML viewer created at: {viewer_file}")
+            
+            # Open the viewer by default unless --no-ui is specified
+            if not args.no_ui:
+                print("Opening Mermaid diagram in local viewer...")
+                webbrowser.open(f"file://{os.path.abspath(viewer_file)}")
+        
         return 0
     except Exception as e:
         print(f"Error: {str(e)}")
 
@@ -23,3 +23,8 @@ def write_dict_to_file(dict_to_write, file_path):
 def read_dict_from_file(file_path):
     with open(file_path, "r") as file:
         return json.load(file)
+
+def find_potential_matches(lineage_data, model_name):
+    """Return the keys of lineage_data that contain model_name as a case-insensitive substring."""
+    model_name = model_name.lower()
+    return [model for model in lineage_data.keys() if model_name in model.lower()]