This repository has been archived by the owner on Sep 24, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 265
/
tudou.py
executable file
·77 lines (62 loc) · 2.6 KB
/
tudou.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
__all__ = ['tudou_download', 'tudou_download_playlist', 'tudou_download_by_id', 'tudou_download_by_iid']
from common import *
def tudou_download_by_iid(iid, title, merge=True):
xml = get_html('http://v2.tudou.com/v?it=' + iid + '&st=1,2,3,4,99')
from xml.dom.minidom import parseString
doc = parseString(xml)
title = title or doc.firstChild.getAttribute('tt') or doc.firstChild.getAttribute('title')
urls = [(int(n.getAttribute('brt')), n.firstChild.nodeValue.strip()) for n in doc.getElementsByTagName('f')]
url = max(urls, key=lambda x:x[0])[1]
assert 'f4v' in url
#url_save(url, filepath, bar):
download_urls([url], title, 'flv', total_size=None, merge=merge)
def tudou_download_by_id(id, title, merge=True):
html = get_html('http://www.tudou.com/programs/view/%s/' % id)
iid = r1(r'iid\s*=\s*(\S+)', html)
tudou_download_by_iid(iid, title, merge=merge)
def tudou_download(url, merge=True):
html = get_decoded_html(url)
iid = r1(r'iid\s*[:=]\s*(\d+)', html)
assert iid
title = r1(r"kw\s*[:=]\s*'([^']+)'", html)
assert title
title = unescape_html(title)
tudou_download_by_iid(iid, title, merge=merge)
def parse_playlist(url):
#if r1('http://www.tudou.com/playlist/p/a(\d+)\.html', url):
# html = get_html(url)
# print re.search(r'<script>var.*?</script>', html, flags=re.S).group()
#else:
# raise NotImplementedError(url)
raise NotImplementedError()
def parse_playlist(url):
aid = r1('http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)
html = get_decoded_html(url)
if not aid:
aid = r1(r"aid\s*[:=]\s*'(\d+)'", html)
if re.match(r'http://www.tudou.com/albumcover/', url):
atitle = r1(r"title\s*:\s*'([^']+)'", html)
elif re.match(r'http://www.tudou.com/playlist/p/', url):
atitle = r1(r'atitle\s*=\s*"([^"]+)"', html)
else:
raise NotImplementedError(url)
assert aid
assert atitle
import json
#url = 'http://www.tudou.com/playlist/service/getZyAlbumItems.html?aid='+aid
url = 'http://www.tudou.com/playlist/service/getAlbumItems.html?aid='+aid
return [(atitle + '-' + x['title'], str(x['itemId'])) for x in json.loads(get_html(url))['message']]
def tudou_download_playlist(url, create_dir=False, merge=True):
if create_dir:
raise NotImplementedError('please report a bug so I can implement this')
videos = parse_playlist(url)
for i, (title, id) in enumerate(videos):
print 'Downloading %s of %s videos...' % (i + 1, len(videos))
tudou_download_by_iid(id, title, merge=merge)
download = tudou_download
download_playlist = tudou_download_playlist
def main():
script_main('tudou', tudou_download, tudou_download_playlist)
if __name__ == '__main__':
main()