Skip to content

Commit c80b7b9

Browse files
committed
init
0 parents  commit c80b7b9

File tree

3 files changed

+71
-0
lines changed

3 files changed

+71
-0
lines changed

.gitignore

Whitespace-only changes.

README.md

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# aws-glue-pyspark-fetch-databases-and-tables-metadata-playground
2+
3+
Example AWS Glue PySpark job script that fetches the metadata of all Data Catalog databases and tables.
4+
5+
* The first method uses [Spark SQL](https://spark.apache.org/sql/).
6+
* The second method uses the [Python boto3 Glue client](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html) to interact with the Glue API directly.
7+
8+
see [`main.py`](main.py)
9+
10+
## Notes
11+
12+
Ensure the `--enable-glue-datacatalog` job parameter is set on the Glue job so that Spark SQL can access the Glue Data Catalog metadata.
13+
14+
![](https://www.evernote.com/l/AAG3O9zQGjhBQYiqT7_owkUm9K-UXd0bMCEB/image.png)
15+
16+
Glue Console Script View
17+
18+
![](https://www.evernote.com/l/AAG2b5Bdis5KFbt6ijxtySgIG7e2P8jPE0UB/image.png)

main.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import sys
2+
from awsglue.transforms import *
3+
from awsglue.utils import getResolvedOptions
4+
from pyspark.context import SparkContext
5+
from awsglue.context import GlueContext
6+
from awsglue.job import Job
7+
8+
# Resolve the job arguments from the command line; only JOB_NAME is requested.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
9+
10+
# Create the Spark context and wrap it in a GlueContext.
sc = SparkContext()
11+
glueContext = GlueContext(sc)
12+
# Spark session used by spark_sql_example() to run its Spark SQL statements.
spark = glueContext.spark_session
13+
# NOTE(review): `job` is not referenced again in the visible source
# (no job.init() / job.commit() call) — confirm whether that is intended.
job = Job(glueContext)
14+
15+
# fetch database and table metadata
16+
17+
# --- using spark sql example ---
18+
# NOTE: the `--enable-glue-datacatalog` job parameter must be set on the Glue job
19+
def spark_sql_example():
    """Print the column names and types of every table in the ``default`` database.

    Runs ``show tables`` and then ``describe`` for each table through the
    module-level ``spark`` session, and prints a single text report covering
    all tables. The Glue job must run with ``--enable-glue-datacatalog`` so
    that Spark SQL can see the catalog.
    """
    spark.sql("use default")
    tables = spark.sql("show tables").rdd.collect()

    # Gather the report fragments and join them once, instead of growing a
    # string with `+=` inside nested loops.
    parts = []
    for table in tables:
        parts.append(f"-- schema for {table.tableName} ---\n")
        # The table name is wrapped in backticks inside the statement.
        columns = spark.sql(f"describe `{table.tableName}`").rdd.collect()
        parts.extend(
            f"column name: {column['col_name']}, type: {column['data_type']}\n"
            for column in columns
        )
        # Blank lines separate one table's section from the next.
        parts.append("\n\n\n")

    print("".join(parts))
31+
32+
# --- using boto3 example ---
33+
34+
import boto3
35+
import json
36+
37+
def _iter_database_names(client):
    """Yield the name of every catalog database, following all result pages."""
    for page in client.get_paginator('get_databases').paginate():
        for database in page['DatabaseList']:
            yield database['Name']


def _iter_table_names(client, database_name):
    """Yield the name of every table in *database_name*, following all result pages."""
    pages = client.get_paginator('get_tables').paginate(DatabaseName=database_name)
    for page in pages:
        for table in page['TableList']:
            yield table['Name']


def glue_client_example():
    """Print the Glue metadata of every table in every catalog database.

    Uses the boto3 Glue client: lists the databases, lists the tables of each
    database, fetches each table with ``get_table`` and prints the response as
    indented, key-sorted JSON.

    ``get_databases`` and ``get_tables`` return their results in pages, so both
    listings go through boto3 paginators. A single un-paginated call would
    silently stop after the first page on larger catalogs.
    """
    client = boto3.client('glue')
    for database_name in _iter_database_names(client):
        for table_name in _iter_table_names(client, database_name):
            table_resp = client.get_table(DatabaseName=database_name, Name=table_name)
            # default=str stringifies any value json cannot serialize natively.
            print(json.dumps(table_resp, indent=2, sort_keys=True, default=str))
47+
48+
def main():
    """Run both metadata examples: Spark SQL first, then the boto3 Glue client."""
    for run_example in (spark_sql_example, glue_client_example):
        run_example()


main()
53+

0 commit comments

Comments
 (0)