you-get/src/you_get/extractors/ucas.py

#!/usr/bin/env python

__all__ = ['ucas_download', 'ucas_download_single', 'ucas_download_playlist']

from ..common import *
import urllib.error
import requests
from time import time
from random import random
import xml.etree.ElementTree as ET
from copy import copy

"""
Do not replace requests.get with get_content,
because UCAS's server does not return data correctly!
"""

def dictify(r, root=True):
    """Convert an ElementTree element into nested plain dictionaries.

    Adapted from http://stackoverflow.com/a/30923963/2946714

    With ``root`` true the result is ``{r.tag: <contents>}``; otherwise only
    the contents mapping is returned.  The contents start as a shallow copy
    of the element's attributes, gain a ``"_text"`` entry when the element
    has non-empty text, and map every child tag to a list of the converted
    children carrying that tag, in document order.
    """
    if root:
        return {r.tag: dictify(r, False)}

    node = copy(r.attrib)
    if r.text:
        node["_text"] = r.text
    for child in r.findall("./*"):
        # Children sharing a tag end up in the same list.
        node.setdefault(child.tag, []).append(dictify(child, False))
    return node

def _get_video_query_url(resourceID):
    """Query the vplus ``remote.do`` endpoint for a resource's video URL.

    Sends a ``method=query`` GET request describing ``resourceID`` and
    returns whatever ``match1`` captures after ``video":"`` in the UTF-8
    decoded response body.
    """
    # The original author notes the request "has to be like this", so the
    # header set and the parameter order are kept exactly as they were.
    browser_headers = {
        'DNT': '1',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-CA,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.47 Safari/537.36',
        'Accept': '*/*',
        'Referer': 'http://v.ucas.ac.cn/',
        'Connection': 'keep-alive',
    }

    # JSON-like description of the requested resource, built by hand.
    resource_spec = '[{"resourceID":"' + resourceID + '","on":1,"time":60000,"eid":100,"w":800,"h":600}]'
    # Current Unix time in whole seconds, as a string.
    stamp = '{timeStamp}'.format(timeStamp = int(time()))

    query = (
        ('method', 'query'),
        ('loginname', 'videocas'),
        ('pwd', 'af1c7a4c5f77f790722f7cae474c37e281203765d423a23b'),
        ('resource', resource_spec),
        ('timeStamp', stamp),
    )

    response = requests.get('http://210.76.211.10/vplus/remote.do', headers=browser_headers, params=query)
    body = response.content.decode('utf-8')
    return match1(body, r'video":"(.+)"')

def _get_virtualPath(video_query_url):
    """Fetch ``video_query_url`` and extract the value of getVirtualPath().

    The response is decoded as UTF-8 and searched for a JavaScript
    ``function getVirtualPath() { return '<word>'`` definition; the quoted
    word is what ``match1`` is asked to capture.
    """
    # Original note: getResourceJsCode2
    script = requests.get(video_query_url).content.decode('utf-8')
    pattern = r"function\s+getVirtualPath\(\)\s+{\s+return\s+'(\w+)'"
    return match1(script, pattern)


def _get_video_list(resourceID):
    """Download the flash XML of a resource and list its video part values.

    Returns a list of lists.  The first inner list holds the ``value`` of
    every ``video`` entry under ``mainUrl``, ordered by integer ``index``.
    A second inner list built the same way from ``subUrl`` is appended when
    that node has a ``_flv`` entry.
    """
    def ordered_values(url_node):
        # Part values of one mainUrl/subUrl node, ordered by numeric index.
        segments = url_node['_flv'][0]['part'][0]['video']
        segments = sorted(segments, key=lambda seg: int(seg['index']))
        return [seg['value'] for seg in segments]

    endpoint = 'http://210.76.211.10/vplus/member/resource.do?isyulan=0&method=queryFlashXmlByResourceId&resourceId={resourceID}&randoms={randoms}'.format(
        resourceID = resourceID,
        randoms = random())
    payload = requests.get(endpoint).content.decode('utf-8')

    # Only the text before the first '___!!!___' marker is parsed as XML.
    tree = dictify(ET.fromstring(payload.split('___!!!___')[0]))
    video = tree['video']

    # main
    collected = [ordered_values(video['mainUrl'][0])]

    # sub
    sub_node = video['subUrl'][0]
    if '_flv' in sub_node:
        collected.append(ordered_values(sub_node))

    return collected

def _ucas_get_url_lists_by_resourceID(resourceID):
    """Build the stream URLs for every video track of a resource.

    Looks up the video query URL and the virtual path for ``resourceID``,
    fetches the per-track part paths via ``_get_video_list`` and turns each
    path into a ``getImgByStream`` URL.

    Returns:
        A list of lists of URL strings, one inner list per track, in the
        order produced by ``_get_video_list``.

    Raises:
        AssertionError: if any of the three lookups yields nothing.
    """
    video_query_url = _get_video_query_url(resourceID)
    # Truthiness check rather than `!= ''`, so that both an empty string and
    # None are rejected here instead of being passed on to the next request.
    # NOTE(review): match1 is assumed to return None on no match - confirm.
    assert video_query_url, 'Cannot find video GUID!'

    virtualPath = _get_virtualPath(video_query_url)
    assert virtualPath, 'Cannot find virtualPath!'

    url_lists = _get_video_list(resourceID)
    assert url_lists, 'Cannot find any URL to download!'

    # make real url
    # credit to a mate in UCAS
    template = 'http://210.76.211.10/vplus/member/resource.do?virtualPath={virtualPath}&method=getImgByStream&imgPath={path}'
    return [
        [template.format(virtualPath = virtualPath, path = path) for path in video_urls]
        for video_urls in url_lists
    ]

def ucas_download_single(url, output_dir = '.', merge = False, info_only = False, **kwargs):
    '''Handle a single video page.

    Fetches ``url``, extracts the first resourceID (a UUID) and the title
    from the page, resolves the URL list of every track and, for each track,
    prints its info and (unless ``info_only``) downloads it as FLV.  Track
    titles are the page title suffixed with ``_<track number>``.

    Raises:
        AssertionError: if the page holds no resourceID or no URLs are found.
    '''
    html = get_content(url)
    # resourceID is UUID
    resource_ids = re.findall( r'resourceID":"([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', html)
    # Validate before indexing: findall returns [] when nothing matches, and
    # indexing that would raise IndexError instead of the intended message.
    assert resource_ids, 'Cannot find resourceID!'
    resourceID = resource_ids[0]

    title = match1(html, r'<div class="bc-h">(.+)</div>')
    url_lists = _ucas_get_url_lists_by_resourceID(resourceID)
    assert url_lists, 'Cannot find any URL of such class!'

    for k, part in enumerate(url_lists):
        part_title = title + '_' + str(k)
        # The size is reported as 0 and total_size is None: it is not computed here.
        print_info(site_info, part_title, 'flv', 0)
        if not info_only:
            download_urls(part, part_title, 'flv', total_size=None, output_dir=output_dir, merge=merge)

def ucas_download_playlist(url, output_dir = '.', merge = False, info_only = False, **kwargs):
    '''Handle a course page.

    Fetches ``url``, collects every ``getplaytitle.do?...`` link found in it
    and hands each one, prefixed with the course base URL, to
    ``ucas_download`` with the same output options.

    Raises:
        AssertionError: if the page contains no such link.
    '''
    course_page = get_content(url)

    links = re.findall( r'(getplaytitle.do\?.+)"', course_page)
    assert links, 'No part found!'

    options = dict(output_dir=output_dir, merge=merge, info_only=info_only)
    for link in links:
        ucas_download('http://v.ucas.ac.cn/course/' + link, **options)

def ucas_download(url, output_dir = '.', merge = False, info_only = False, **kwargs):
    '''Dispatch a UCAS URL to the matching handler.

    URLs containing both ``getplaytitle.do`` and ``classid=`` go to
    ``ucas_download_single``; URLs containing ``CourseIndex.do`` go to
    ``ucas_download_playlist``.  Any other URL is ignored.  Extra keyword
    arguments are accepted but not forwarded.
    '''
    is_video_page = 'classid=' in url and 'getplaytitle.do' in url
    if is_video_page:
        handler = ucas_download_single
    elif 'CourseIndex.do' in url:
        handler = ucas_download_playlist
    else:
        # Unrecognised URL: nothing to do.
        return

    handler(url, output_dir=output_dir, merge=merge, info_only=info_only)

# Site label passed to print_info() by ucas_download_single.
site_info = "UCAS"
# Generic aliases for this module's two entry points.
# NOTE(review): presumably looked up by name by the extractor loader - confirm.
download = ucas_download
download_playlist = ucas_download_playlist
[UCAS] add support 2017-05-14 00:15:29 +03:00			`#!/usr/bin/env python`

			`__all__ = ['ucas_download', 'ucas_download_single', 'ucas_download_playlist']`

			`from ..common import *`
			`import urllib.error`
			`import requests`
			`from time import time`
			`from random import random`
			`import xml.etree.ElementTree as ET`
			`from copy import copy`

			`"""`
			`Do not replace request.get with get_content`
			`for UCAS's server is not correctly returning data!`
			`"""`

			`def dictify(r,root=True):`
			`"""http://stackoverflow.com/a/30923963/2946714"""`
			`if root:`
			`return {r.tag : dictify(r, False)}`
			`d=copy(r.attrib)`
			`if r.text:`
			`d["_text"]=r.text`
			`for x in r.findall("./*"):`
			`if x.tag not in d:`
			`d[x.tag]=[]`
			`d[x.tag].append(dictify(x,False))`
			`return d`

			`def _get_video_query_url(resourceID):`
			`# has to be like this`
			`headers = {`
			`'DNT': '1',`
			`'Accept-Encoding': 'gzip, deflate',`
			`'Accept-Language': 'en-CA,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2',`
			`'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.47 Safari/537.36',`
			`'Accept': '*/*',`
			`'Referer': 'http://v.ucas.ac.cn/',`
			`'Connection': 'keep-alive',`
			`}`
			`params = (`
			`('method', 'query'),`
			`('loginname', 'videocas'),`
			`('pwd', 'af1c7a4c5f77f790722f7cae474c37e281203765d423a23b'),`
			`('resource', '[{"resourceID":"' + resourceID + '","on":1,"time":60000,"eid":100,"w":800,"h":600}]'),`
			`('timeStamp', '{timeStamp}'.format(timeStamp = int(time()))),`
			`)`
			`a = requests.get('http://210.76.211.10/vplus/remote.do', headers=headers, params=params)`
			`info = a.content.decode('utf-8')`
			`return match1(info, r'video":"(.+)"')`

			`def _get_virtualPath(video_query_url):`
			`#getResourceJsCode2`
			`html = requests.get(video_query_url)`
			`html = html.content.decode('utf-8')`

			`return match1(html, r"function\s+getVirtualPath\(\)\s+{\s+return\s+'(\w+)'")`



			`def _get_video_list(resourceID):`
			`""""""`
			`video_xml = requests.get('http://210.76.211.10/vplus/member/resource.do?isyulan=0&method=queryFlashXmlByResourceId&resourceId={resourceID}&randoms={randoms}'.format(resourceID = resourceID,`
			`randoms = random()))`
			`video_xml = video_xml.content.decode('utf-8')`

			`root = ET.fromstring(video_xml.split('___!!!___')[0])`

			`r = dictify(root)`

			`huge_list = []`
			`# main`
			`huge_list.append([i['value'] for i in sorted(r['video']['mainUrl'][0]['_flv'][0]['part'][0]['video'], key=lambda k: int(k['index']))])`

			`# sub`
			`if '_flv' in r['video']['subUrl'][0]:`
			`huge_list.append([i['value'] for i in sorted(r['video']['subUrl'][0]['_flv'][0]['part'][0]['video'], key=lambda k: int(k['index']))])`

			`return huge_list`

			`def _ucas_get_url_lists_by_resourceID(resourceID):`
			`video_query_url = _get_video_query_url(resourceID)`
			`assert video_query_url != '', 'Cannot find video GUID!'`

			`virtualPath = _get_virtualPath(video_query_url)`
			`assert virtualPath != '', 'Cannot find virtualPath!'`

			`url_lists = _get_video_list(resourceID)`
			`assert url_lists, 'Cannot find any URL to download!'`

			`# make real url`
			`# credit to a mate in UCAS`
			`for video_type_id, video_urls in enumerate(url_lists):`
			`for k, path in enumerate(video_urls):`
			`url_lists[video_type_id][k] = 'http://210.76.211.10/vplus/member/resource.do?virtualPath={virtualPath}&method=getImgByStream&imgPath={path}'.format(virtualPath = virtualPath,`
			`path = path)`

			`return url_lists`

			`def ucas_download_single(url, output_dir = '.', merge = False, info_only = False, **kwargs):`
			`'''video page'''`
			`html = get_content(url)`
			`# resourceID is UUID`
			`resourceID = re.findall( r'resourceID":"([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', html)[0]`
			`assert resourceID != '', 'Cannot find resourceID!'`

			`title = match1(html, r'<div class="bc-h">(.+)</div>')`
			`url_lists = _ucas_get_url_lists_by_resourceID(resourceID)`
			`assert url_lists, 'Cannot find any URL of such class!'`

			`for k, part in enumerate(url_lists):`
			`part_title = title + '_' + str(k)`
			`print_info(site_info, part_title, 'flv', 0)`
			`if not info_only:`
			`download_urls(part, part_title, 'flv', total_size=None, output_dir=output_dir, merge=merge)`

			`def ucas_download_playlist(url, output_dir = '.', merge = False, info_only = False, **kwargs):`
			`'''course page'''`
			`html = get_content(url)`

			`parts = re.findall( r'(getplaytitle.do\?.+)"', html)`
			`assert parts, 'No part found!'`

			`for part_path in parts:`
			`ucas_download('http://v.ucas.ac.cn/course/' + part_path, output_dir=output_dir, merge=merge, info_only=info_only)`

			`def ucas_download(url, output_dir = '.', merge = False, info_only = False, **kwargs):`
			`if 'classid=' in url and 'getplaytitle.do' in url:`
			`ucas_download_single(url, output_dir=output_dir, merge=merge, info_only=info_only)`
			`elif 'CourseIndex.do' in url:`
			`ucas_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only)`

			`site_info = "UCAS"`
			`download = ucas_download`
			`download_playlist = ucas_download_playlist`