Skip to content

Commit fa63423

Browse files
committed
feature: Update dbcat version to 0.6.1
This dependency update provides SQL migrations using Alembic as well as managed sessions to eliminate chances of leaked database connections. Other minor improvements: * Remove IDEA config files and setup.py * Fix path in README * Update version to 0.8.0 * Fix code in example.py to analyze queries. The wrong API was used.
1 parent 070557b commit fa63423

20 files changed

+297
-382
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
66
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
77

8+
.idea
9+
810
# User-specific stuff
911
.idea/**/workspace.xml
1012
.idea/**/tasks.xml

.idea/.gitignore

Lines changed: 0 additions & 2 deletions
This file was deleted.

.idea/data-lineage.iml

Lines changed: 0 additions & 18 deletions
This file was deleted.

.idea/inspectionProfiles/profiles_settings.xml

Lines changed: 0 additions & 6 deletions
This file was deleted.

.idea/misc.xml

Lines changed: 0 additions & 7 deletions
This file was deleted.

.idea/modules.xml

Lines changed: 0 additions & 8 deletions
This file was deleted.

.idea/vcs.xml

Lines changed: 0 additions & 6 deletions
This file was deleted.

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Download the docker-compose file from Github repository.
3535
# in a new directory run
3636
wget https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/catalog-demo.yml
3737
# or run
38-
curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/catalog-demo.yml -o docker-compose.yml
38+
curl https://raw.githubusercontent.com/tokern/data-lineage/master/install-manifests/docker-compose/tokern-lineage-engine.yml -o docker-compose.yml
3939

4040

4141
Run docker-compose

data_lineage/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# flake8: noqa
2-
__version__ = "0.7.8"
2+
__version__ = "0.8.0"
33

44
import datetime
55
import json

data_lineage/server.py

Lines changed: 71 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import flask_restless
66
import gunicorn.app.base
7-
from dbcat import Catalog
7+
from dbcat import Catalog, init_db
88
from dbcat.catalog import CatColumn
99
from dbcat.catalog.db import DbScanner
1010
from dbcat.catalog.models import (
@@ -66,23 +66,24 @@ def get(self):
6666
edges = []
6767

6868
args = self._parser.parse_args()
69-
column_edges = self._catalog.get_column_lineages(args["job_ids"])
70-
for edge in column_edges:
71-
nodes.append(self._column_info(edge.source))
72-
nodes.append(self._column_info(edge.target))
73-
nodes.append(self._job_info(edge.job_execution.job))
74-
edges.append(
75-
{
76-
"source": "column:{}".format(edge.source_id),
77-
"target": "task:{}".format(edge.job_execution.job_id),
78-
}
79-
)
80-
edges.append(
81-
{
82-
"source": "task:{}".format(edge.job_execution.job_id),
83-
"target": "column:{}".format(edge.target_id),
84-
}
85-
)
69+
with self._catalog.managed_session:
70+
column_edges = self._catalog.get_column_lineages(args["job_ids"])
71+
for edge in column_edges:
72+
nodes.append(self._column_info(edge.source))
73+
nodes.append(self._column_info(edge.target))
74+
nodes.append(self._job_info(edge.job_execution.job))
75+
edges.append(
76+
{
77+
"source": "column:{}".format(edge.source_id),
78+
"target": "task:{}".format(edge.job_execution.job_id),
79+
}
80+
)
81+
edges.append(
82+
{
83+
"source": "task:{}".format(edge.job_execution.job_id),
84+
"target": "column:{}".format(edge.target_id),
85+
}
86+
)
8687

8788
return {"nodes": nodes, "edges": edges}
8889

@@ -106,14 +107,12 @@ def __init__(self, catalog: Catalog):
106107
self._parser.add_argument("id", required=True, help="ID of the resource")
107108

108109
def post(self):
109-
try:
110-
args = self._parser.parse_args()
111-
logging.debug("Args for scanning: {}".format(args))
110+
args = self._parser.parse_args()
111+
logging.debug("Args for scanning: {}".format(args))
112+
with self._catalog.managed_session:
112113
source = self._catalog.get_source_by_id(int(args["id"]))
113114
DbScanner(self._catalog, source).scan()
114115
return "Scanned {}".format(source.fqdn), 200
115-
finally:
116-
self._catalog.scoped_session.remove()
117116

118117

119118
class Parse(Resource):
@@ -134,27 +133,26 @@ def post(self):
134133
raise ParseErrorHTTP(description=str(error))
135134

136135
try:
137-
source = self._catalog.get_source_by_id(args["source_id"])
138-
logging.debug("Parsing query for source {}".format(source))
139-
binder = parse_dml_query(
140-
catalog=self._catalog, parsed=parsed, source=source
141-
)
142-
143-
return (
144-
{
145-
"select_tables": [table.name for table in binder.tables],
146-
"select_columns": [context.alias for context in binder.columns],
147-
},
148-
200,
149-
)
136+
with self._catalog.managed_session:
137+
source = self._catalog.get_source_by_id(args["source_id"])
138+
logging.debug("Parsing query for source {}".format(source))
139+
binder = parse_dml_query(
140+
catalog=self._catalog, parsed=parsed, source=source
141+
)
142+
143+
return (
144+
{
145+
"select_tables": [table.name for table in binder.tables],
146+
"select_columns": [context.alias for context in binder.columns],
147+
},
148+
200,
149+
)
150150
except TableNotFound as table_error:
151151
raise TableNotFoundHTTP(description=str(table_error))
152152
except ColumnNotFound as column_error:
153153
raise ColumnNotFoundHTTP(description=str(column_error))
154154
except SemanticError as semantic_error:
155155
raise SemanticErrorHTTP(description=str(semantic_error))
156-
finally:
157-
self._catalog.scoped_session.remove()
158156

159157

160158
class Analyze(Resource):
@@ -182,45 +180,44 @@ def post(self):
182180
raise ParseErrorHTTP(description=str(error))
183181

184182
try:
185-
source = self._catalog.get_source_by_id(args["source_id"])
186-
logging.debug("Parsing query for source {}".format(source))
187-
chosen_visitor = analyze_dml_query(self._catalog, parsed, source)
188-
job_execution = extract_lineage(
189-
catalog=self._catalog,
190-
visited_query=chosen_visitor,
191-
source=source,
192-
parsed=parsed,
193-
start_time=datetime.datetime.fromisoformat(args["start_time"]),
194-
end_time=datetime.datetime.fromisoformat(args["end_time"]),
195-
)
196-
197-
return (
198-
{
199-
"data": {
200-
"id": job_execution.id,
201-
"type": "job_executions",
202-
"attributes": {
203-
"job_id": job_execution.job_id,
204-
"started_at": job_execution.started_at.strftime(
205-
"%Y-%m-%d %H:%M:%S"
206-
),
207-
"ended_at": job_execution.ended_at.strftime(
208-
"%Y-%m-%d %H:%M:%S"
209-
),
210-
"status": job_execution.status.name,
211-
},
212-
}
213-
},
214-
200,
215-
)
183+
with self._catalog.managed_session:
184+
source = self._catalog.get_source_by_id(args["source_id"])
185+
logging.debug("Parsing query for source {}".format(source))
186+
chosen_visitor = analyze_dml_query(self._catalog, parsed, source)
187+
job_execution = extract_lineage(
188+
catalog=self._catalog,
189+
visited_query=chosen_visitor,
190+
source=source,
191+
parsed=parsed,
192+
start_time=datetime.datetime.fromisoformat(args["start_time"]),
193+
end_time=datetime.datetime.fromisoformat(args["end_time"]),
194+
)
195+
196+
return (
197+
{
198+
"data": {
199+
"id": job_execution.id,
200+
"type": "job_executions",
201+
"attributes": {
202+
"job_id": job_execution.job_id,
203+
"started_at": job_execution.started_at.strftime(
204+
"%Y-%m-%d %H:%M:%S"
205+
),
206+
"ended_at": job_execution.ended_at.strftime(
207+
"%Y-%m-%d %H:%M:%S"
208+
),
209+
"status": job_execution.status.name,
210+
},
211+
}
212+
},
213+
200,
214+
)
216215
except TableNotFound as table_error:
217216
raise TableNotFoundHTTP(description=str(table_error))
218217
except ColumnNotFound as column_error:
219218
raise ColumnNotFoundHTTP(description=str(column_error))
220219
except SemanticError as semantic_error:
221220
raise SemanticErrorHTTP(description=str(semantic_error))
222-
finally:
223-
self._catalog.scoped_session.remove()
224221

225222

226223
class Server(gunicorn.app.base.BaseApplication):
@@ -289,6 +286,8 @@ def create_server(
289286
pool_pre_ping=True
290287
)
291288

289+
init_db(catalog)
290+
292291
restful_catalog = Catalog(
293292
**catalog_options,
294293
connect_args={"application_name": "data-lineage:restful"},
@@ -300,7 +299,7 @@ def create_server(
300299
# Create CRUD APIs
301300
methods = ["DELETE", "GET", "PATCH", "POST"]
302301
url_prefix = "/api/v1/catalog"
303-
api_manager = flask_restless.APIManager(app, catalog.scoped_session)
302+
api_manager = flask_restless.APIManager(app, catalog.get_scoped_session())
304303
api_manager.create_api(
305304
CatSource,
306305
methods=methods,

example.ipynb

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
},
4545
{
4646
"cell_type": "code",
47-
"execution_count": null,
47+
"execution_count": 1,
4848
"metadata": {},
4949
"outputs": [],
5050
"source": [
@@ -62,9 +62,20 @@
6262
},
6363
{
6464
"cell_type": "code",
65-
"execution_count": null,
65+
"execution_count": 2,
6666
"metadata": {},
67-
"outputs": [],
67+
"outputs": [
68+
{
69+
"data": {
70+
"text/plain": [
71+
"True"
72+
]
73+
},
74+
"execution_count": 2,
75+
"metadata": {},
76+
"output_type": "execute_result"
77+
}
78+
],
6879
"source": [
6980
"# Setup a connection to catalog using the SDK.\n",
7081
"from data_lineage import Catalog\n",
@@ -82,7 +93,7 @@
8293
},
8394
{
8495
"cell_type": "code",
85-
"execution_count": null,
96+
"execution_count": 3,
8697
"metadata": {},
8798
"outputs": [],
8899
"source": [
@@ -94,19 +105,39 @@
94105
},
95106
{
96107
"cell_type": "code",
97-
"execution_count": null,
108+
"execution_count": 5,
98109
"metadata": {
99110
"scrolled": true
100111
},
101-
"outputs": [],
112+
"outputs": [
113+
{
114+
"name": "stdout",
115+
"output_type": "stream",
116+
"text": [
117+
"{'name': 'LOAD page_lookup_nonredirect', 'query': 'INSERT INTO page_lookup_nonredirect SELECT page.page_id as redircet_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page LEFT OUTER JOIN redirect ON page.page_id = redirect.rd_from WHERE redirect.rd_from IS NULL '}\n"
118+
]
119+
},
120+
{
121+
"ename": "TypeError",
122+
"evalue": "analyze() missing 2 required positional arguments: 'start_time' and 'end_time'",
123+
"output_type": "error",
124+
"traceback": [
125+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
126+
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
127+
"\u001b[0;32m/tmp/ipykernel_2259588/1883341295.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mqueries\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0manalyze\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0manalyze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msource\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
128+
"\u001b[0;31mTypeError\u001b[0m: analyze() missing 2 required positional arguments: 'start_time' and 'end_time'"
129+
]
130+
}
131+
],
102132
"source": [
103-
"from data_lineage import Parser\n",
133+
"from datetime import datetime\n",
134+
"from data_lineage import Analyze\n",
104135
"\n",
105-
"parser = Parser(docker_address)\n",
136+
"analyze = Analyze(docker_address)\n",
106137
"\n",
107138
"for query in queries:\n",
108139
" print(query)\n",
109-
" parser.parse(**query, source=source)"
140+
" analyze.analyze(**query, source=source, start_time=datetime.now(), end_time=datetime.now())"
110141
]
111142
},
112143
{
@@ -128,7 +159,7 @@
128159
],
129160
"metadata": {
130161
"kernelspec": {
131-
"display_name": "Python 3",
162+
"display_name": "Python 3 (ipykernel)",
132163
"language": "python",
133164
"name": "python3"
134165
},
@@ -142,7 +173,7 @@
142173
"name": "python",
143174
"nbconvert_exporter": "python",
144175
"pygments_lexer": "ipython3",
145-
"version": "3.8.5"
176+
"version": "3.8.10"
146177
}
147178
},
148179
"nbformat": 4,

0 commit comments

Comments
 (0)