1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
import os
import re
import shutil
from subprocess import call

import requests
def urls_iter(origin_url='http://v.163.com/special/opencourse/algorithms.html',
              begin_from=1):
    """Yield ``(video_title, video_address)`` pairs scraped from a course page.

    *origin_url* is the origin download page of a 163 OpenCourse; the
    default value is the 'algorithms course from MIT' page.

    *begin_from* makes the function jump to the No.*begin_from* item
    (counting from 1) before it starts yielding.

    Each title has the form ``'<NN>_<course name>.mp4'``, where ``NN`` is the
    zero-padded position of the row inside the course table. Rows that lack
    either a title link or a download link are skipped, but they still count
    towards the numbering.

    Raises ``requests.HTTPError`` when the page answers with an error status
    and ``ValueError`` when the course table cannot be located in the page.
    Because this is a generator, nothing is fetched (and nothing is raised)
    until the first item is requested.
    """
    download_request = requests.get(origin_url)
    # Stop on 4xx/5xx instead of trying to parse an error page.
    download_request.raise_for_status()

    course_list_pattern = re.compile(
        r'<table class="m-clist" id="list2".*?>(.*?)</table>',
        flags=re.DOTALL)
    course_pattern = re.compile(
        r'<tr class="(?:u-odd|u-even)">.*?</tr>', flags=re.DOTALL)
    course_name_pattern = re.compile(r'<a href=.*?>(.*?)</a>', flags=re.DOTALL)
    course_video_pattern = re.compile(
        r'<a class="downbtn" href=[\'""](.*?)[\'""].*?>.*?</a>',
        flags=re.DOTALL)

    table_match = course_list_pattern.search(download_request.text)
    if table_match is None:
        # Without the table there is nothing to iterate; say so explicitly
        # rather than failing with an AttributeError on None.
        raise ValueError(
            'course list table not found in page: {}'.format(origin_url))
    course_list = course_pattern.findall(table_match.group(1))

    for index, course in enumerate(course_list, 1):
        if index < begin_from:
            continue

        name_match = course_name_pattern.search(course)
        video_match = course_video_pattern.search(course)
        if name_match is None or video_match is None:
            # A row without a title link or a download button cannot be
            # downloaded; skip it so the remaining rows are still yielded.
            continue

        # A '/' in the scraped name would be read as a directory separator
        # when the title is later joined onto the download directory.
        course_name = name_match.group(1).replace('/', '_')
        video_title = '{:0>2}_'.format(index) + course_name + '.mp4'
        yield (video_title, video_match.group(1))
def download_course(download_list,
                    download_dir='/Users/zealot/Downloads/algorithms'):
    """Download every video in *download_list* into *download_dir* with wget.

    *download_list* is an iterable of 2-tuples whose 1st element is the
    filename to store the video under and whose 2nd element is the download
    url of the video.

    *download_dir* is the directory the files are stored in; it is created
    (including missing parents) when it does not exist yet.

    The common download tool ``wget`` is invoked once per video, one at a
    time, with ``-c`` so that a partially downloaded file is continued.

    Raises ``OSError`` if ``wget`` is not found on ``PATH``, or if
    *download_dir* cannot be created.
    """
    # shutil.which searches PATH in-process: no external `which` binary and
    # no /dev/null handle are needed just to probe for the tool.
    if shutil.which('wget') is None:
        raise OSError('command not found: wget')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_dir, exist_ok=True)

    for video_title, video_address in download_list:
        # The exit status of wget is not inspected, so one failed download
        # does not prevent the remaining videos from being attempted.
        call(['wget', '-c', video_address,
              '-O', os.path.join(download_dir, video_title)])
if __name__ == '__main__':
    # Script entry point: scrape the default course page and download every
    # video it lists into the default download directory.
    course_videos = urls_iter()
    download_course(course_videos)
|