-
Notifications
You must be signed in to change notification settings - Fork 0
/
json2csv2.py
executable file
·148 lines (97 loc) · 3.67 KB
/
json2csv2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import json
import pandas as pd
import re
# Column-wise accumulator for the fields extracted from each JSON record.
data = {
    'fact': [],                     # case description text
    'accusation': [],               # charges
    'relevant_articles_label': [],  # relevant law articles
    'death_penalty': [],            # whether the sentence is death
    'life_imprisonment': [],        # whether the sentence is life imprisonment
    'imprisonment': [],             # length of the fixed-term sentence
}

# The corpus is read as one JSON document per line.  The ``with`` block
# guarantees the file handle is closed even if a record is missing a key.
with open("./json/Big.json", encoding="UTF-8") as data_train_read:
    for i, line in enumerate(data_train_read):
        if i % 10000 == 0:
            print("processed {} of 1710000".format(i))
        try:
            a = json.loads(line)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; skip malformed lines
            # without also swallowing KeyboardInterrupt or programming errors.
            print("error json")
            continue
        meta = a['meta']
        term = meta['term_of_imprisonment']
        data['fact'].append(a['fact'])
        data['accusation'].append(meta['accusation'])
        data['relevant_articles_label'].append(meta['relevant_articles'])
        data['death_penalty'].append(term['death_penalty'])
        data['life_imprisonment'].append(term['life_imprisonment'])
        data['imprisonment'].append(term['imprisonment'])

# One row per successfully parsed record.
data = pd.DataFrame(data)
import re
import jieba
# Ordered (pattern, replacement) rewrites applied to the raw text before
# segmentation.  Order matters: each rule sees the output of the previous one.
_FACT_SUBSTITUTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"\d*年", ""),          # digits + 年 (year)
        (r'\d*月', ''),          # digits + 月 (month)
        (r'\d*日', ''),          # digits + 日 (day)
        (r'\d*时', ''),          # digits + 时 (hour)
        (r"\d*分", ""),          # digits + 分 (minute)
        (r"\r\n", ""),           # Windows line breaks
        (r"\.\d+", ""),          # fractional part of numbers
        (r".\d某", "某某"),       # <char><digit>某 -> generic name placeholder
        (r".某\d", "某某"),       # <char>某<digit> -> generic name placeholder
        (r'万.', '万元'),         # 万 plus the following character -> 万元
        (r'(?<=\d)g', '克'),      # "g" right after a digit -> 克 (gram)
        # NOTE(review): "inphone" looks like a typo for "iphone" - confirm
        # against the data before changing the pattern.
        (r'inphone\d?[a-zA-Z]?', 'inphone'),
        (r'\d+×+\d+', 'SJHAO'),  # masked phone numbers
        (r'[a-wy-zA-WY-Z]+\d?(×|x|X)+', 'CHEPAIHAO'),   # masked licence plates
        (r'[a-wy-zA-WY-Z]+\d+[a-zA-Z]?', 'CHEPAIHAO'),  # licence plates
    )
)

# jieba does not recognise these on its own, so they are added as custom words.
_CUSTOM_WORDS = ('mg', 'kg', 'ml', 'iPhone')
# A token containing one of these is collapsed to the marker itself.
_REGION_MARKERS = ('市', '县', '镇', '村', '区', '街道')
# Units that mark the preceding numeric token as an amount.
_AMOUNT_UNITS = frozenset(['万元', '元', '美元', '克', 'mg', 'kg', '千克', '公斤', '斤'])
# Single-character tokens that survive the single-character filter.
_KEPT_SINGLE_CHARS = frozenset(['克', '元', '刀', '性', '斤', '枪'])

_custom_words_registered = False


def _register_custom_words():
    """Add the custom words to jieba's dictionary, once per process."""
    global _custom_words_registered
    if _custom_words_registered:
        return
    for word in _CUSTOM_WORDS:
        jieba.add_word(word)
    _custom_words_registered = True


def preprocessing_fact(text, num_thr_list=None):
    """Normalise a case-fact string and return it as space-separated tokens.

    The text is stripped of spaces, rewritten with the ``_FACT_SUBSTITUTIONS``
    rules, segmented with ``jieba.lcut`` and then cleaned token by token:
    region tokens collapse to their marker, tokens containing 某 become 某某,
    numeric amounts in front of a unit are rounded to their leading digit, and
    most remaining single-character tokens are dropped.

    Args:
        text: The raw fact string.
        num_thr_list: Optional iterable of numeric thresholds.  When given,
            an amount token is replaced by the first threshold that is
            strictly greater than the rounded amount.  The original code
            referenced an undefined ``num_thr_list`` here, so this bucketing
            is a reconstruction of the apparent intent.
            NOTE(review): confirm the intended thresholds and semantics.

    Returns:
        The surviving tokens joined with single spaces.
    """
    text = text.replace(" ", "")
    for pattern, replacement in _FACT_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)

    _register_custom_words()
    tokens = jieba.lcut(text)

    # Every rule may look at the following token, so the loop stops one short
    # of the end; the final token is therefore never rewritten here.
    for i in range(len(tokens) - 1):
        # Region normalisation: the first marker found replaces the token.
        for marker in _REGION_MARKERS:
            if marker in tokens[i]:
                tokens[i] = marker
                break

        # Person-name normalisation.
        if '某' in tokens[i]:
            tokens[i] = '某某'
        if len(tokens[i]) == 1 and '某' in tokens[i + 1]:
            # A lone character in front of a 某 token is dropped.
            tokens[i] = ''
        if 'x' in tokens[i + 1] and len(tokens[i]) == 1:
            # <single char> followed by a token containing "x" becomes one
            # 某某 token.
            tokens[i] = '某某'
            tokens[i + 1] = ''

        # Amount normalisation.  isdecimal() rather than isdigit(): float()
        # rejects digit-like characters such as "①" that isdigit() accepts.
        if tokens[i + 1] in _AMOUNT_UNITS and tokens[i].isdecimal():
            # Keep only the leading digit, e.g. "3456" -> "3000".
            rounded = tokens[i][0] + "0" * (len(tokens[i]) - 1)
            num_i = float(rounded)
            tokens[i] = rounded
            if num_thr_list is not None:
                for threshold in num_thr_list:
                    if threshold > num_i:
                        tokens[i] = str(threshold)
                        break
            # The rounded value is appended after the (possibly bucketed) token.
            tokens[i] = tokens[i] + " " + str(num_i)

        # Drop single-character tokens unless they are digits or whitelisted.
        if (
            len(tokens[i]) == 1
            and not tokens[i].isdigit()
            and tokens[i] not in _KEPT_SINGLE_CHARS
        ):
            tokens[i] = ''

    return " ".join(token for token in tokens if token)
# Segment and normalise every fact, storing the result in a new column.
fact_column = data['fact']
data['fact_cut_wd'] = fact_column.map(preprocessing_fact)

# Write the enriched table to CSV without the index column.
data.to_csv('./csv/Big.csv', index=False)