-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpre_processing.py
136 lines (108 loc) · 5.43 KB
/
pre_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3.6
import os
import pandas as pd
#from sqlalchemy import true
from wordcloud import WordCloud
from common import *
import nltk
import codecs, json
import math
# Fetch the WordNet corpus as soon as the script is loaded.
# NOTE(review): presumably required by the lemmatizer used further down
# (it is not defined in this file) -- confirm against `common`.
nltk.download('wordnet')
#### Useful functions
def preprocess_spelling(input_list, split_by_comma=True, camel_case_to_spaces=True, underscore_to_spaces=True, spaces_to_underscores=False,
                        to_lowercase=True, remove_words=True, remove_dash=True, change_and=True, column_name=''):
    """Clean up a collection of raw label strings.

    Placeholder entries ('-' and '') are dropped first.  The enabled steps
    then run in this fixed order: comma splitting, '&' -> space,
    '-' -> space, camel-case splitting, '_' -> space, space -> '_',
    lower-casing and filler-word removal.  Each replacement step strips the
    surrounding whitespace of an entry before replacing.

    Args:
        input_list: Iterable of strings; an entry may hold several
            comma-separated labels.
        split_by_comma: Split every entry on ',' and flatten the result.
        camel_case_to_spaces: Pass every entry through ``camel_case_split``
            (not defined in this block; presumably provided by ``common``).
        underscore_to_spaces: Replace '_' with a space.
        spaces_to_underscores: Replace spaces with '_'.
        to_lowercase: Lower-case every entry.
        remove_words: Remove 'robot ' and 'robotic ' and, when
            ``column_name`` is non-empty, ``column_name`` in its plural
            (``column_name + 's'``) and singular form.  The match is
            case-sensitive and runs after lower-casing.
        remove_dash: Replace '-' with a space.
        change_and: Replace '&' with a space.
        column_name: Word to strip from every entry (see ``remove_words``).
            With a non-empty name the last removal is not followed by a
            strip, so entries may keep surrounding whitespace.

    Returns:
        A new list with the cleaned strings; ``input_list`` is not modified.
    """
    # Drop placeholders up front so that ``output_list`` exists on every
    # path, including ``split_by_comma=False``.
    entries = [s for s in input_list if s not in ('-', '')]
    if split_by_comma:
        # An empty label after a trailing ',' survives as '' and can trip
        # later processing.
        output_list = [item for s in entries for item in s.split(',')]
    else:
        output_list = entries
    if change_and:
        output_list = [n.strip().replace('&', ' ') for n in output_list]  # '&' -> space
    if remove_dash:
        output_list = [n.strip().replace('-', ' ') for n in output_list]  # '-' -> space
    if camel_case_to_spaces:
        output_list = [camel_case_split(s) for s in output_list]  # resolve camel case into spaces
    if underscore_to_spaces:
        output_list = [n.strip().replace('_', ' ') for n in output_list]  # '_' -> space
    if spaces_to_underscores:
        output_list = [n.strip().replace(' ', '_') for n in output_list]  # space -> '_'
    if to_lowercase:
        output_list = [s.lower() for s in output_list]
    if remove_words:
        # The trailing space in the patterns keeps words that merely start
        # with 'robot' (without a following space) intact.
        output_list = [n.strip().replace('robot ', ' ') for n in output_list]
        output_list = [n.strip().replace('robotic ', ' ') for n in output_list]
        if column_name:
            output_list = [n.strip().replace(column_name + 's', '') for n in output_list]  # plural form
            output_list = [n.strip().replace(column_name, '') for n in output_list]  # singular form
        else:
            # With an empty column name the plural pattern would be just 's'
            # and would delete every 's' in every entry; only strip instead.
            output_list = [n.strip() for n in output_list]
    return output_list
############ MAIN FILE ############
#### Read file with dataframe
# Earlier input file: "skill-taxonomy-extraction/data/in/20220127_skillTaxonomy.csv"
this_file_dir = os.path.dirname(os.path.realpath(__file__))
resultsFile = os.path.join(this_file_dir, "../data/in/20220223_skillTaxonomy.csv")
taxonomy = pd.read_csv(resultsFile, delimiter=';')
# Only rows flagged as relevant ('Y') take part in the extraction.
taxonomy = taxonomy[taxonomy['relevant'] == 'Y']
# taxonomy = taxonomy.loc[taxonomy['ind'] == 'X']  # to select industrial or non industrial

taxonomyExp = pd.DataFrame(columns=[
    'author', 'link', 'relevant', 'how', 'requirements', 'ind', 'hier',
    'similarity', 'skillclass', 'identified skillclass', 'skill',
    'identified skill', 'primitive', 'identified primitive',
    'parametrizedskill', 'identified parametrizedskill', 'task',
    'identified task', 'request', 'identified request', 'process',
    'identified process', 'arch', 'impl', 'param', 'paramtype', 'bibtex',
])
column = 'identified primitive'

# Expand the table: a row whose `column` cell lists k comma-separated entries
# is copied k times, so the expanded frame holds one row per entry.
# New labels start at the label of the first source row and count upwards.
localIdx = 0
for position, (idx, source_row) in enumerate(taxonomy.iterrows()):
    if position == 0:
        localIdx = idx
    copies = len(source_row[column].split(','))
    for _ in range(copies):
        taxonomyExp.loc[localIdx] = taxonomy.loc[idx]
        localIdx += 1
# Do tests on data integrity
def check_data_integrity(taxonomyExp):
    """Validate the 'identified ...' text columns of the expanded taxonomy.

    The columns 'identified task', 'identified skill' and
    'identified primitive' are checked one after another.  For each column
    three passes run over all of its values, in this order:

    1. no value may contain a digit,
    2. no value may contain a space-separated word shorter than three
       characters, unless it is one of the allowed short words (compared
       case-insensitively); empty tokens produced by consecutive spaces
       are ignored,
    3. no value may contain '(' or ')'.

    Args:
        taxonomyExp: DataFrame that provides the three columns above.

    Raises:
        ValueError: On the first violation found; the message names the
            column and the offending value.  ``ValueError`` derives from
            ``Exception``, so handlers written for ``Exception`` still
            catch it.
        KeyError: If one of the three columns is missing.

    NOTE(review): values are assumed to be strings; a non-string cell (for
    example a missing value) fails inside the checks instead of producing
    a ``ValueError`` -- confirm the columns never contain such cells.
    """
    allowed_short_words = ("-", ",", "in", "up", "to", "on", "of", "at", "or")

    def has_digits(x):
        return any(c.isdigit() for c in x)

    def has_too_short_word(x):
        # ``0 < len(w)`` skips the empty tokens that ``split(' ')`` yields
        # for consecutive spaces.
        return any(
            0 < len(w) < 3 and w.lower() not in allowed_short_words
            for w in x.split(' ')
        )

    def has_parenthesis(x):
        return '(' in x or ')' in x

    checks = (
        (has_digits, "field contains digits"),
        (has_too_short_word, "field contains too short word"),
        (has_parenthesis, "field contains at least one parenthesis (, )"),
    )

    for key in ('identified task', 'identified skill', 'identified primitive'):
        col = taxonomyExp[key]
        # One full pass over the column per check keeps the order in which
        # violations are reported: all digit checks first, then short
        # words, then parentheses.
        for violates, description in checks:
            for value in col:
                if violates(value):
                    raise ValueError(f"Column '{key}': {description}: {value}")
check_data_integrity(taxonomyExp)

# Rows that only hold the '-' placeholder in the target column are discarded.
taxonomyExp = taxonomyExp[taxonomyExp[column] != '-']
print(f"data frame length: {len(taxonomyExp[column].to_list())}")
# Author's note: once the '-' rows are discarded, taxonomyExp has as many rows
# as the preprocessing below produces entries.

#### Preprocess the data by cleaning and lemmatization
# Last word of the column name, e.g. 'primitive' for 'identified primitive';
# it is removed from the labels and used to name the output files.
columnSuffix = column.split(' ')[-1]
v_lemmatizer = np.vectorize(lemmatizer)
outputList = preprocess_spelling(input_list=taxonomy[column].dropna(), column_name=columnSuffix)
lemmaOutputList = v_lemmatizer(outputList)
# unique names have not been sorted
print(f"processed length: {len(lemmaOutputList)}")

# save pkl file for finding cluster related papers
taxonomyExp.to_pickle(os.path.join(this_file_dir, "..", f"data/in/{columnSuffix}.pkl"))

# Store the lemmatized labels as a JSON list next to the pickle.
jsonString = json.dumps(lemmaOutputList.tolist())
jsonPath = os.path.join(this_file_dir, "..", f"data/in/{columnSuffix}Def.json")
with open(jsonPath, 'w') as outfile:
    outfile.write(jsonString)
### extracted
# 63 TASK
# 296 SKILL
# 167 PRIMITIVE
# 526 TOTAL
# 329 SINGLE
# 132 SINGLE PRIMITIVES
# 198 SINGLE SKILLS
# 63 SINGLE TASKS
###