forked from sdgilley/learn-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
120 lines (109 loc) · 4.57 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# some functions to use for find-snippets, pr-report, and merge-report
# function to clean up the matches
# syntax of a match is different if it is from a notebook vs. code files.
# returns info about the match:
# path to the file in azureml-examples
# file name of the file being referenced
# branch used to find the file (i.e., azureml-examples-main)
# match - the full match
# name - the name of the notebook cell
def cleanup_matches(match):
    """Split a raw snippet reference into path, file, branch and cell name.

    The reference is first stripped of parentheses, double quotes, commas
    and the literal text ``source=``. What remains is expected to look like
    ``~/<branch>/<dirs...>/<file>[?<name>]``.

    Args:
        match: the raw reference text as found in the referencing document.

    Returns:
        A tuple ``(path, ref_file, branch, match, name)`` where
        ``path`` is the directory of the file with the leading ``~/`` and
        branch component removed ('' when the file sits directly under the
        branch), ``ref_file`` is ``path`` joined with the file name,
        ``branch`` is the second '/'-separated component of the directory
        ('' when there is none), ``match`` is the cleaned-up reference and
        ``name`` is whatever follows the first '?' in the file name
        ('' when there is no '?').
    """
    import os

    # Drop the wrapping punctuation in one pass, then the 'source=' label.
    match = match.translate(str.maketrans('', '', '()",')).replace('source=', '')
    path = os.path.dirname(match)
    ref_file = os.path.basename(match)

    # The first component after "~/" is the "path-to-root", which includes
    # the branch name. A reference without that component used to raise
    # IndexError; it now simply yields an empty branch.
    parts = path.split('/')
    branch = parts[1] if len(parts) > 1 else ''

    # Strip only the LEADING "~/" and branch component. A blanket
    # str.replace would also eat the same text where it recurs deeper in
    # the path (e.g. branch "main" inside a directory called "domain").
    if path == '~':
        path = ''
    elif path.startswith('~/'):
        path = path[2:]
    if path == branch:
        path = ''
    elif path.startswith(f"{branch}/"):
        path = path[len(branch) + 1:]

    # Split the optional "?name" suffix off the file name.
    ref_file, _, name = ref_file.partition('?')

    if path != '':  # an empty path must not produce a leading slash
        ref_file = f"{path}/{ref_file}"
        # get rid of triple or double slashes
        ref_file = ref_file.replace('///', '/').replace('//', '/')
    # match and name are returned for completeness; callers may ignore them
    return (path, ref_file, branch, match, name)
# this function gets the changes for a specific file in a PR.
# Then searches for notebook cells or code snippets that were added/deleted.
# Returns a tuple with a boolean for whether the file is a notebook,
# a list of added cells, and a list of deleted cells.
def find_changes(thisfile, prfiles):
    """Find named notebook cells or snippet tags added/deleted for one file.

    Args:
        thisfile: file name to look up; compared against each entry's
            'filename' value.
        prfiles: iterable of dict-like entries with a 'filename' key and,
            usually, a 'patch' key holding the diff text.

    Returns:
        A tuple ``(nb, adds, deletes)``: ``nb`` is True when ``thisfile``
        ends with '.ipynb'; ``adds`` and ``deletes`` list the cell names
        (notebooks) or ``# <tag>`` comments (other files) found on added
        and removed diff lines respectively.
    """
    import re

    # Entries without a patch are skipped instead of raising KeyError.
    # NOTE(review): presumably these are binary/oversized files in the
    # GitHub API response - confirm against the caller.
    patch = [
        file['patch']
        for file in prfiles
        if file['filename'] == thisfile and file.get('patch')
    ]
    # Both patterns run against str(patch), i.e. the list's repr, in which
    # line breaks appear as the two characters backslash + 'n'. That is why
    # they look for a literal "\n" followed by '+' or '-'.
    nb_cell = r'(\\n[\+-])\s*"name":\s*"([^"]*)"'  # added or deleted cells with a name
    code_cell = r'(\\n[\+-])\s*(#\s*<[^>]*>)'  # lines that start with # <> or # </>
    # only works for files that use # as comment.
    adds = []
    deletes = []
    nb = thisfile.endswith('.ipynb')
    matches = re.findall(nb_cell if nb else code_cell, str(patch))
    for match in matches:
        if match[0] == "\\n+":
            adds.append(match[1])
        elif match[0] == "\\n-":  # was `match[0 == "\\n-"]`, which is always truthy
            deletes.append(match[1])
        else:
            print("ERROR in utilities.py find_changes. The match was not an add or delete.")
    return (nb, adds, deletes)
# function to read local file - try utf-8 first, then latin-1
def read_file(file_path):
    """Return the lines of a local text file as a list of strings.

    The file is decoded as UTF-8 first. If that raises UnicodeDecodeError
    it is re-read as latin-1; any error during that second attempt is
    printed and an empty list is returned. Errors other than
    UnicodeDecodeError during the first attempt propagate to the caller.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.readlines()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the latin-1 attempt below.
        pass

    try:
        with open(file_path, 'r', encoding='latin-1') as handle:
            content = handle.readlines()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        content = []
    return content
def read_snippets():
    """Load ``refs-found.csv`` from this module's directory as a DataFrame.

    Returns:
        The result of ``pandas.read_csv`` on the snippets file.

    Raises:
        SystemExit: with status 1 when the file does not exist, after
            printing how to create it. (Previously the exit status was 0,
            which made the failure look like success to the calling shell.)
    """
    import os
    import sys
    import pandas as pd

    # The snippets file lives next to this module.
    fn = "refs-found.csv"
    snippet_fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), fn)

    if not os.path.exists(snippet_fn):
        print(f"{snippet_fn} does not exist.")
        print("Run 'find-snippets.py' to create the file.")
        sys.exit(1)
    return pd.read_csv(snippet_fn)
# function to connect to GitHub repo
def connect_repo(repo_name):
    """Return the GitHub repository object for ``repo_name``.

    The access token is read from the ``GH_ACCESS_TOKEN`` environment
    variable.

    Args:
        repo_name: repository identifier passed to ``Github.get_repo``.

    Returns:
        Whatever ``Github(token).get_repo(repo_name)`` returns.

    Raises:
        SystemExit: with status 1 when ``GH_ACCESS_TOKEN`` is not set.
    """
    import os
    import sys

    try:
        token = os.environ['GH_ACCESS_TOKEN']
    except KeyError:
        # Only a missing variable is handled here; a bare `except:` would
        # also have swallowed KeyboardInterrupt and real bugs.
        print("Please set GH_ACCESS_TOKEN environment variable")
        sys.exit(1)  # non-zero so calling scripts can detect the failure

    # Imported after the token check so the hint above is shown first.
    from github import Github

    # NOTE(review): newer PyGithub releases prefer
    # Github(auth=Auth.Token(token)) - confirm against the installed version.
    g = Github(token)
    return g.get_repo(repo_name)
# function to compare a file on two branches in a repo
def compare_branches(repo, file, branch1, branch2):
    """Print whether ``file`` is identical on ``branch1`` and ``branch2``.

    The file is fetched from each branch with ``repo.get_contents`` and the
    two ``sha`` attributes are compared. Nothing is returned; the outcome
    is reported with a single printed line.
    """
    # Fetch both versions up front, in branch1 -> branch2 order.
    versions = [repo.get_contents(file, ref=ref) for ref in (branch1, branch2)]
    unchanged = versions[0].sha == versions[1].sha
    if not unchanged:
        print(f"*azureml-examples {branch2} branch has a DIFFERENT version of this file from {branch1}\n")
        return
    print(f"*azureml-examples {branch2} branch has the same version of this file as {branch1}\n")