-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_test_tsv_agregate.py
86 lines (55 loc) · 2.14 KB
/
_test_tsv_agregate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# import _utils as u
import pandas as pd
# YT_VIDEO_ID = 'Stu8h5cVzoQ'
# YT_VIDEO_ID = 'YZhaZCLresQ'
# YT_VIDEO_ID = 'VOff0Uhr8X0'
# YT_VIDEO_ID = 'Uc75PYSYPpI'
# YT_VIDEO_ID = 'nJ660t5ku9A'
# YT_VIDEO_ID = 'tNrlSai6JGA'
# YT_LINK = f'https://www.youtube.com/watch?v={YT_VIDEO_ID}'
# tsv = pd.read_csv(f'out/e118b05411965199a84d288c5279633a.ru.tsv', sep='\t')
# # tsv = tsv.head(10)
# tsv['text_len']=tsv['text'].apply(lambda x :len(x))
# tsv['start_sec']=tsv['start'].apply(lambda x :x/1000)
# # print(tsv)
# groups = []
# group = 0
# cumsum = 0
# for n in tsv["text_len"]:
# if cumsum >= 1000:
# cumsum = 0
# group = group + 1
# cumsum = cumsum + n
# groups.append(group)
# # print(groups)
# new = tsv.groupby(groups).agg({'text':' '.join, 'start_sec': lambda x: x.min().round().astype(int)})
# # print(new)
# # print(new["text"][26])
# f = open(f"out/{YT_VIDEO_ID}.md", "w",encoding='utf8')
# for index, row in new.iterrows():
# str = f' - ~~[▶]({YT_LINK}&t={row["start_sec"]})~~ {row["text"]} \n'
# f.write(str)
# f.close()
def tsv_to_md(file_path, url: str, chunk_chars: int = 1000) -> str:
    """Render a subtitle TSV as a Markdown list of timestamped links.

    The TSV must have a ``start`` column and a ``text`` column. Consecutive
    rows are merged into one chunk; a new chunk is started once the chunk
    being built has accumulated at least ``chunk_chars`` characters of text.
    Each chunk becomes one list item that links to ``url`` with a ``t=``
    parameter set to the chunk's earliest ``start`` divided by 1000 and
    rounded to a whole number (presumably ``start`` is in milliseconds --
    confirm against the producer of the TSV).

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as a tab-separated
            input.
        url: Link target. Any existing ``t`` query parameter is replaced,
            and ``?`` or ``&`` is chosen depending on whether the URL
            already has a query string.
        chunk_chars: Character count after which a chunk is closed.

    Returns:
        The Markdown text, one `` - ~~[▶](link)~~ text `` line per chunk, or
        an empty string when the TSV has no data rows.
    """
    tsv = pd.read_csv(
        file_path,
        sep='\t',
        # Keep subtitle text verbatim: with the default NA handling, cells
        # such as "NA", "None" or an empty string become NaN (a float).
        dtype={'text': str},
        keep_default_na=False,
        # Blank ``start`` cells are still parsed as missing values.
        na_values={'start': ['']},
    )
    if tsv.empty:
        # groupby() with an empty key list raises, so short-circuit here.
        return ''

    link_prefix, link_suffix = _timestamp_link_parts(url)

    # Assign every row a chunk id. The threshold is checked *before* adding
    # the current row, so a chunk may exceed ``chunk_chars`` by one row.
    chunk_ids = []
    chunk_id = 0
    chars_in_chunk = 0
    for text_len in tsv['text'].str.len():
        if chars_in_chunk >= chunk_chars:
            chars_in_chunk = 0
            chunk_id += 1
        chars_in_chunk += text_len
        chunk_ids.append(chunk_id)

    chunks = (
        tsv.assign(start_sec=tsv['start'] / 1000)
        .groupby(chunk_ids)
        .agg({'text': ' '.join, 'start_sec': 'min'})
    )
    seconds = chunks['start_sec'].round().astype(int)

    return ''.join(
        f' - ~~[▶]({link_prefix}{sec}{link_suffix})~~ {text} \n'
        for text, sec in zip(chunks['text'], seconds)
    )


def _timestamp_link_parts(url: str):
    """Return the pieces of ``url`` that go before and after the ``t`` value.

    The first element ends with ``t=``; the second is the ``#fragment`` (or
    an empty string) so that the fragment stays after the query string.
    """
    base, hash_sep, fragment = url.partition('#')
    head, _, query = base.partition('?')
    # Drop empty segments and any pre-existing ``t`` parameter.
    kept = [
        part for part in query.split('&')
        if part and part.split('=', 1)[0] != 't'
    ]
    new_query = '&'.join(kept + ['t='])
    return f'{head}?{new_query}', hash_sep + fragment