From f8a5c23356ec8208feaa87eeb581337a4ba7b89e Mon Sep 17 00:00:00 2001 From: huangc Date: Fri, 6 Dec 2019 20:10:01 +0800 Subject: [PATCH] join headers into common.py --- src/you_get/common.py | 10 +++++++++- src/you_get/extractors/bilibili.py | 5 +++-- src/you_get/extractors/flickr.py | 7 ++++--- src/you_get/extractors/miaopai.py | 8 -------- src/you_get/extractors/missevan.py | 8 +++----- src/you_get/extractors/netease.py | 21 ++++++++++++--------- src/you_get/extractors/pptv.py | 4 ++-- src/you_get/extractors/showroom.py | 7 ------- 8 files changed, 33 insertions(+), 37 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 2397a0a6..361e58f3 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -147,6 +147,14 @@ fake_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', # noqa } +fake_headers_mobile = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Charset': 'UTF-8,*;q=0.5', + 'Accept-Encoding': 'gzip,deflate,sdch', + 'Accept-Language': 'en-US,en;q=0.8', + 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36' +} + if sys.stdout.isatty(): default_encoding = sys.stdout.encoding.lower() else: @@ -413,7 +421,7 @@ def urlopen_with_retry(*args, **kwargs): raise http_error -def get_content(url, headers={}, decoded=True): +def get_content(url, headers=fake_headers, decoded=True): """Gets the content of a URL via sending a HTTP GET request. 
Args: diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index e5ddbafc..5dd55fc2 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -44,8 +44,9 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_headers(referer=None, cookie=None): # a reasonable UA - ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36' - headers = {'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'User-Agent': ua} + #ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36' + #headers = {'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'User-Agent': ua} + headers = fake_headers if referer is not None: headers.update({'Referer': referer}) if cookie is not None: diff --git a/src/you_get/extractors/flickr.py b/src/you_get/extractors/flickr.py index 2535dd1c..9d42c75c 100644 --- a/src/you_get/extractors/flickr.py +++ b/src/you_get/extractors/flickr.py @@ -54,9 +54,10 @@ tmpl_api_call_photo_info = ( # looks that flickr won't return urls for all sizes # we required in 'extras field without a acceptable header -dummy_header = { - 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0' -} +#dummy_header = { +# 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0' +#} +dummy_header = fake_headers def get_content_headered(url): return get_content(url, dummy_header) diff --git a/src/you_get/extractors/miaopai.py b/src/you_get/extractors/miaopai.py index 0ddcadba..62c46cd3 100644 --- a/src/you_get/extractors/miaopai.py +++ b/src/you_get/extractors/miaopai.py @@ -9,14 +9,6 @@ import urllib.error import urllib.parse from ..util import fs -fake_headers_mobile = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Charset': 'UTF-8,*;q=0.5', - 'Accept-Encoding': 
'gzip,deflate,sdch', - 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36' -} - def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = False, **kwargs): '''Source: Android mobile''' page_url = 'http://video.weibo.com/show?fid=' + fid + '&type=mp4' diff --git a/src/you_get/extractors/missevan.py b/src/you_get/extractors/missevan.py index c2b25a90..5deb061c 100644 --- a/src/you_get/extractors/missevan.py +++ b/src/you_get/extractors/missevan.py @@ -26,7 +26,7 @@ import json import os import re -from ..common import get_content, urls_size, log, player, dry_run +from ..common import get_content, urls_size, log, player, dry_run, fake_headers from ..extractor import VideoExtractor _UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 ' \ @@ -125,7 +125,6 @@ class MissEvanWithStream(VideoExtractor): def __init__(self, *args): super().__init__(*args) self.referer = 'https://www.missevan.com/' - self.ua = _UA @classmethod def create(cls, title, streams, *, streams_sorted=None): @@ -175,8 +174,7 @@ class MissEvan(VideoExtractor): def __init__(self, *args): super().__init__(*args) self.referer = 'https://www.missevan.com/' - self.ua = _UA - self.__headers = {'User-Agent': self.ua, 'Referer': self.referer} + self.__headers = {'User-Agent': fake_headers['User-Agent'], 'Referer': self.referer} __prepare_dispatcher = _Dispatcher() @@ -326,7 +324,7 @@ class MissEvan(VideoExtractor): stream['size'] = urls_size(stream['src']) def _get_content(self, url): - return get_content(url, headers=self.__headers) + return get_content(url) def _get_json(self, url): content = self._get_content(url) diff --git a/src/you_get/extractors/netease.py b/src/you_get/extractors/netease.py index f74747b1..53a0307e 100644 --- a/src/you_get/extractors/netease.py +++ b/src/you_get/extractors/netease.py @@ -11,6 +11,9 @@ import 
hashlib import base64 import os +local_header = fake_headers.copy() +local_header["Referer"] = "http://music.163.com/" + def netease_hymn(): return """ player's Game Over, @@ -26,7 +29,7 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals if rid is None: rid = match1(url, r'/(\d+)/?') if "album" in url: - j = loads(get_content("http://music.163.com/api/album/%s?id=%s&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) + j = loads(get_content("http://music.163.com/api/album/%s?id=%s&csrf_token=" % (rid, rid), headers=local_header)) artist_name = j['album']['artists'][0]['name'] album_name = j['album']['name'].strip() @@ -41,12 +44,12 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals netease_song_download(i, output_dir=new_dir, info_only=info_only) try: # download lyrics assert kwargs['caption'] - l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % i['id'], headers={"Referer": "http://music.163.com/"})) + l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % i['id'], headers=local_header)) netease_lyric_download(i, l["lrc"]["lyric"], output_dir=new_dir, info_only=info_only) except: pass elif "playlist" in url: - j = loads(get_content("http://music.163.com/api/playlist/detail?id=%s&csrf_token=" % rid, headers={"Referer": "http://music.163.com/"})) + j = loads(get_content("http://music.163.com/api/playlist/detail?id=%s&csrf_token=" % rid, headers=local_header)) new_dir = output_dir + '/' + fs.legitimize(j['result']['name']) if not info_only: @@ -61,30 +64,30 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals netease_song_download(i, output_dir=new_dir, info_only=info_only, playlist_prefix=playlist_prefix) try: # download lyrics assert kwargs['caption'] - l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % i['id'], headers={"Referer": 
"http://music.163.com/"})) + l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % i['id'], headers=local_header)) netease_lyric_download(i, l["lrc"]["lyric"], output_dir=new_dir, info_only=info_only, playlist_prefix=playlist_prefix) except: pass elif "song" in url: - j = loads(get_content("http://music.163.com/api/song/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) + j = loads(get_content("http://music.163.com/api/song/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers=local_header)) netease_song_download(j["songs"][0], output_dir=output_dir, info_only=info_only) try: # download lyrics assert kwargs['caption'] - l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % rid, headers={"Referer": "http://music.163.com/"})) + l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % rid, headers=local_header)) netease_lyric_download(j["songs"][0], l["lrc"]["lyric"], output_dir=output_dir, info_only=info_only) except: pass elif "program" in url: - j = loads(get_content("http://music.163.com/api/dj/program/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) + j = loads(get_content("http://music.163.com/api/dj/program/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers=local_header)) netease_song_download(j["program"]["mainSong"], output_dir=output_dir, info_only=info_only) elif "radio" in url: - j = loads(get_content("http://music.163.com/api/dj/program/byradio/?radioId=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) + j = loads(get_content("http://music.163.com/api/dj/program/byradio/?radioId=%s&ids=[%s]&csrf_token=" % (rid, rid), headers=local_header)) for i in j['programs']: netease_song_download(i["mainSong"],output_dir=output_dir, info_only=info_only) elif "mv" in url: - j = 
loads(get_content("http://music.163.com/api/mv/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) + j = loads(get_content("http://music.163.com/api/mv/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers=local_header)) netease_video_download(j['data'], output_dir=output_dir, info_only=info_only) def netease_lyric_download(song, lyric, output_dir='.', info_only=False, playlist_prefix=""): diff --git a/src/you_get/extractors/pptv.py b/src/you_get/extractors/pptv.py index dacd78e4..5166cf63 100644 --- a/src/you_get/extractors/pptv.py +++ b/src/you_get/extractors/pptv.py @@ -192,14 +192,14 @@ class PPTV(VideoExtractor): if self.url and not self.vid: if not re.match(r'https?://v.pptv.com/show/(\w+)\.html', self.url): raise('Unknown url pattern') - page_content = get_content(self.url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}) + page_content = get_content(self.url) self.vid = match1(page_content, r'webcfg\s*=\s*{"id":\s*(\d+)') if not self.vid: raise('Cannot find id') api_url = 'http://web-play.pptv.com/webplay3-0-{}.xml'.format(self.vid) api_url += '?appplt=flp&appid=pptv.flashplayer.vod&appver=3.4.2.28&type=&version=4' - dom = parseString(get_content(api_url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"})) + dom = parseString(get_content(api_url)) self.title, m_items, m_streams, m_segs = parse_pptv_xml(dom) xml_streams = merge_meta(m_items, m_streams, m_segs) for stream_id in xml_streams: diff --git a/src/you_get/extractors/showroom.py b/src/you_get/extractors/showroom.py index 606dc806..736ac51c 100644 --- a/src/you_get/extractors/showroom.py +++ b/src/you_get/extractors/showroom.py @@ -10,13 +10,6 @@ from time import time, sleep #---------------------------------------------------------------------- def 
showroom_get_roomid_by_room_url_key(room_url_key): """str->str""" - fake_headers_mobile = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Charset': 'UTF-8,*;q=0.5', - 'Accept-Encoding': 'gzip,deflate,sdch', - 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36' - } webpage_url = 'https://www.showroom-live.com/' + room_url_key html = get_content(webpage_url, headers = fake_headers_mobile) roomid = match1(html, r'room\?room_id\=(\d+)')