mirror of
https://github.com/soimort/you-get.git
synced 2025-02-09 11:42:30 +03:00
Merge branch 'flickr_extractor_dev' of https://github.com/YenvY/you-get into YenvY-flickr_extractor_dev
This commit is contained in:
commit
8e40b66a7e
@ -1,39 +1,228 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
__all__ = ['flickr_download']
|
__all__ = ['flickr_download_main']
|
||||||
|
|
||||||
from ..common import *
|
from ..common import *
|
||||||
|
|
||||||
def flickr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
|
import json
|
||||||
page = get_html(url)
|
|
||||||
title = match1(page, r'<meta property="og:title" content="([^"]*)"')
|
|
||||||
photo_id = match1(page, r'"id":"([0-9]+)"')
|
|
||||||
|
|
||||||
try: # extract video
|
# Regular expressions applied (through match1) to the URL supplied by the
# caller.  The single capture group of each one is the identifier that the
# corresponding get_*_id() helper returns.

# www.flickr.com/photos/{user}/sets|albums/{album-id}
pattern_url_photoset = r'https?://www\.flickr\.com/photos/.+/(?:(?:sets)|(?:albums))?/([^/]+)'
# www.flickr.com/photos/{user}[/][page{N}[/]]  -- captures {user}.
# The optional tail accepts "", "/", "/page", "/page2" and "/page2/".
# The previous tail "(?:/|(?:/page))?$" rejected every URL that carried a
# page number, so paginated photostream links were reported as unsupported.
pattern_url_photostream = r'https?://www\.flickr\.com/photos/([^/]+)(?:/(?:page\d*/?)?)?$'
# www.flickr.com/photos/{user}/{numeric-photo-id}
pattern_url_single_photo = r'https?://www\.flickr\.com/photos/[^/]+/(\d+)'
# www.flickr.com/photos/{user}/galleries/{numeric-gallery-id}
pattern_url_gallery = r'https?://www\.flickr\.com/photos/[^/]+/galleries/(\d+)'
# www.flickr.com/groups/{group}
pattern_url_group = r'https?://www\.flickr\.com/groups/([^/]+)'
# www.flickr.com/photos/{user}/favorites
pattern_url_favorite = r'https?://www\.flickr\.com/photos/([^/]+)/favorites'

# Regular expressions applied to the HTML of a Flickr page.
pattern_inline_title = r'<title>([^<]*)</title>'
pattern_inline_api_key = r'api\.site_key\s*=\s*"([^"]+)"'
pattern_inline_img_url = r'"url":"([^"]+)","key":"[^"]+"}}'
pattern_inline_NSID = r'"nsid"\s*:\s*"([^"]+)"'
pattern_inline_video_mark = r'("mediaType":"video")'
|
||||||
|
|
||||||
|
# Prefix shared by every REST call template below.
_tmpl_api_base = 'https://api.flickr.com/services/rest?&format=json&nojsoncallback=1'

# Collection listing.  Format arguments: (api_key, method, ext, page)
#   api_key -- API key string
#   method  -- API method name without the leading 'flickr.'
#   ext     -- extra query fragment, already starting with '&' (may be '')
#   page    -- page number (integer, formatted with %d)
# per_page is fixed at 500.  According to the original author's note,
# 'flickr.galleries.getPhotos' ignores this parameter (although its
# documentation says otherwise) and always behaves as if it were 500.
# For testing, a smaller value such as '&per_page=5' can be substituted.
tmpl_api_call = _tmpl_api_base + (
    '&per_page=500'
    '&api_key=%s'
    '&method=flickr.%s'
    '&extras=url_sq,url_q,url_t,url_s,url_n,url_m,url_z,url_c,url_l,url_h,url_k,url_o,media'
    '%s&page=%d'
)

# Stream list of a video.  Format arguments: (api_key, photo_id, secret)
tmpl_api_call_video_info = _tmpl_api_base + (
    '&method=flickr.video.getStreamInfo'
    '&api_key=%s'
    '&photo_id=%s'
    '&secret=%s'
)

# Metadata of a single photo/video.  Format arguments: (api_key, photo_id)
tmpl_api_call_photo_info = _tmpl_api_base + (
    '&method=flickr.photos.getInfo'
    '&api_key=%s'
    '&photo_id=%s'
)
|
||||||
|
|
||||||
|
# Without an acceptable (browser-like) header, Flickr does not seem to
# return URLs for all of the sizes requested through the 'extras' field,
# so API calls are sent with this User-Agent.
dummy_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
}
|
||||||
|
def get_content_headered(url):
    """Fetch *url* with get_content, passing dummy_header as its second argument."""
    content = get_content(url, dummy_header)
    return content
|
||||||
|
|
||||||
|
def get_photoset_id(url, page):
    """Return the album id captured from *url* by pattern_url_photoset.

    *page* is unused; it is accepted so that all id parsers referenced in
    url_patterns share the (url, page) signature.
    """
    photoset_id = match1(url, pattern_url_photoset)
    return photoset_id
|
||||||
|
|
||||||
|
def get_photo_id(url, page):
    """Return the numeric photo id captured from *url* by pattern_url_single_photo.

    *page* is unused; it only keeps the (url, page) parser signature.
    """
    photo_id = match1(url, pattern_url_single_photo)
    return photo_id
|
||||||
|
|
||||||
|
def get_gallery_id(url, page):
    """Return the numeric gallery id captured from *url* by pattern_url_gallery.

    *page* is unused; it only keeps the (url, page) parser signature.
    """
    gallery_id = match1(url, pattern_url_gallery)
    return gallery_id
|
||||||
|
|
||||||
|
def get_api_key(page):
    """Return the site API key embedded in a Flickr page.

    The key is searched for in *page* with pattern_inline_api_key.  When it
    is not found there, the Flickr home page is fetched and searched with
    the same pattern instead.
    """
    api_key = match1(page, pattern_inline_api_key)
    if api_key:
        return api_key
    # Author's note: the key is missing only when the URL points to a
    # gallery page, which contains no inline api_key (and never makes XHR
    # API calls).  Taking the key from the home page might in fact be the
    # better approach for obtaining a temporary key, since the home page
    # offers no place where a user could add custom information that might
    # misguide the regex.
    return match1(get_html('https://flickr.com'), pattern_inline_api_key)
|
||||||
|
|
||||||
|
def get_NSID(url, page):
    """Return the "nsid" value found in *page* by pattern_inline_NSID.

    *url* is unused; it only keeps the (url, page) parser signature.
    """
    nsid = match1(page, pattern_inline_NSID)
    return nsid
|
||||||
|
|
||||||
|
# Dispatch table for collection URLs, tried in order by
# fetch_photo_url_list.  Every entry is a 5-tuple:
#   (regex_match_url,
#    remote_api_method,
#    additional_query_parameter_for_method,
#    parser_for_additional_parameter,
#    field_where_photourls_are_saved)
url_patterns = [
    # www.flickr.com/photos/{username|NSID}/sets|albums/{album-id}
    (pattern_url_photoset, 'photosets.getPhotos', 'photoset_id', get_photoset_id, 'photoset'),

    # www.flickr.com/photos/{username|NSID}/{pageN}?
    # According to the Flickr API documentation, people.getPhotos needs to
    # be authenticated in order to filter the photos visible to the calling
    # user, but it seems to work fine anonymously as well.
    (pattern_url_photostream, 'people.getPhotos', 'user_id', get_NSID, 'photos'),

    # www.flickr.com/photos/{username|NSID}/galleries/{gallery-id}
    (pattern_url_gallery, 'galleries.getPhotos', 'gallery_id', get_gallery_id, 'photos'),

    # www.flickr.com/groups/{groupname|groupNSID}/
    (pattern_url_group, 'groups.pools.getPhotos', 'group_id', get_NSID, 'photos'),

    # www.flickr.com/photos/{username|NSID}/favorites/*
    (pattern_url_favorite, 'favorites.getList', 'user_id', get_NSID, 'photos'),
]
|
||||||
|
|
||||||
|
def flickr_download_main(url, output_dir='.', merge=False, info_only=False, **kwargs):
    """Download one Flickr photo/video, or every item of a collection.

    url        -- Flickr page URL.  A URL matching pattern_url_single_photo
                  is handled by get_single_photo_url; anything else goes to
                  fetch_photo_url_list, which raises NotImplementedError
                  for URL shapes it does not know.
    output_dir -- directory passed on to download_urls.
    merge      -- accepted for interface compatibility; not used here.
    info_only  -- when true, only print the media information.
    kwargs     -- 'stream_id', if present, selects the preferred size
                  suffix (default 'o'); it applies to collections only.

    Each item is saved under the page title followed by '[<index>]', where
    <index> is the item's position in the resolved URL list.
    """
    import sys

    preferred_size = kwargs.get('stream_id', 'o')  # works for collections only

    if match1(url, pattern_url_single_photo):
        media_url, title = get_single_photo_url(url)
        media_urls = [media_url]
    else:
        media_urls, title = fetch_photo_url_list(url, preferred_size)

    # The title comes from the page's <title> element and is None when that
    # is missing; fall back to the site name so the file name can be built.
    title = title or site_info

    for index, media_url in enumerate(media_urls):
        if not media_url:
            # The URL resolvers return None when no source could be found
            # for an item; skip it instead of crashing inside url_info.
            sys.stderr.write('Flickr: no downloadable URL for item %d, skipped\n' % index)
            continue
        mime, ext, size = url_info(media_url)
        print_info(site_info, title, mime, size)
        if not info_only:
            # NOTE(review): the positional arguments after `ext` are kept
            # exactly as in the original call -- confirm their meaning
            # against download_urls' signature.
            download_urls([media_url], '%s[%d]' % (title, index), ext, False, output_dir, None, False, False)
|
||||||
|
|
||||||
except: # extract images
|
def fetch_photo_url_list(url, size):
    """Resolve a collection *url* to (list_of_media_urls, title).

    The entries of url_patterns are tried in order; the first one whose
    regex matches *url* supplies the remaining arguments for
    fetch_photo_url_list_impl.  Raises NotImplementedError when no entry
    matches.
    """
    for url_regex, *api_args in url_patterns:
        # FIXME: fix multiple matching since the match group is dropped
        if not match1(url, url_regex):
            continue
        return fetch_photo_url_list_impl(url, size, *api_args)
    raise NotImplementedError('Flickr extractor is not supported for %s.' % url)
|
||||||
|
|
||||||
print_info(site_info, title, ext, size)
|
def fetch_photo_url_list_impl(url, size, method, id_field, id_parse_func, collection_name):
    """Collect the media URLs of every item in a Flickr collection.

    url             -- page URL of the collection.
    size            -- preferred size suffix, passed to get_url_of_largest.
    method          -- API method name without the leading 'flickr.'.
    id_field        -- name of the query parameter that identifies the
                       collection for *method*.
    id_parse_func   -- callable(url, page) returning the value for
                       *id_field*; when falsy, no extra parameter is sent.
    collection_name -- key of the JSON reply that holds the page of results.

    Returns (urls, title): the resolved URLs of all pages, and the content
    of the page's <title> element as found by pattern_inline_title.
    Items for which no URL could be resolved are left out.
    """
    page = get_html(url)
    api_key = get_api_key(page)

    ext_field = ''
    if id_parse_func:
        ext_field = '&%s=%s' % (id_field, id_parse_func(url, page))

    urls = []
    page_number = 1
    while True:
        call_url = tmpl_api_call % (api_key, method, ext_field, page_number)
        collection = json.loads(get_content_headered(call_url))[collection_name]
        photos = collection['photo']
        for info in photos:
            media_url = get_url_of_largest(info, api_key, size)
            if media_url:
                urls.append(media_url)
        # The type of 'page' and 'pages' may differ between methods, hence
        # int().  Stopping on ">=" or on an empty page (rather than on
        # equality alone) guarantees termination even when the reported
        # page number never equals the page count, e.g. for an empty
        # collection reporting 0 pages.
        if not photos or int(collection['page']) >= int(collection['pages']):
            break
        page_number += 1
    return urls, match1(page, pattern_inline_title)
|
||||||
|
|
||||||
|
# Image size suffixes, listed in descending order of size (per the original
# author's note they are the ones used in the inline JSON 'key' field).
# get_url_of_largest prefixes each with 'url_' to look up an item's URL.
size_suffixes = [
    'o', 'k', 'h', 'l', 'c', 'z',
    'm', 'n', 's', 't', 'q', 'sq',
]
|
||||||
|
|
||||||
|
def get_orig_video_source(api_key, pid, secret):
    """Return the URL of a video's 'orig' stream, or None if there is none.

    The stream list is requested with tmpl_api_call_video_info for photo id
    *pid* and its *secret*.  Backslashes are removed from the returned URL.
    """
    reply = get_content_headered(tmpl_api_call_video_info % (api_key, pid, secret))
    streams = json.loads(reply)['streams']['stream']
    return next(
        (entry['_content'].replace('\\', '') for entry in streams if entry['type'] == 'orig'),
        None,
    )
|
||||||
|
|
||||||
|
def get_url_of_largest(info, api_key, size):
    """Return the URL of the largest available rendition of one item.

    info    -- one entry of the API reply's 'photo' list.
    api_key -- API key, needed only for non-photo items.
    size    -- preferred size suffix.  When it is one of size_suffixes, only
               that size and the ones listed after it are considered;
               otherwise every size is considered.

    For items whose 'media' is not 'photo', the result of
    get_orig_video_source is returned.  For photos, the first 'url_<suffix>'
    present in *info* is returned with backslashes removed, or None when
    none of the candidate sizes is present.
    """
    if info['media'] != 'photo':
        return get_orig_video_source(api_key, info['id'], info['secret'])

    candidates = size_suffixes
    if size in candidates:
        candidates = candidates[candidates.index(size):]

    for suffix in candidates:
        key = 'url_' + suffix
        if key in info:
            return info[key].replace('\\', '')
    return None
|
||||||
|
|
||||||
|
def get_single_photo_url(url):
    """Resolve a single photo/video page to (media_url, title).

    The page is fetched and its <title> content extracted with
    pattern_inline_title.  When the page carries the inline video marker,
    the item's secret is requested through tmpl_api_call_photo_info and the
    result of get_orig_video_source is returned as the URL (this may be
    None).  Otherwise the image URL captured by pattern_inline_img_url is
    returned, with backslashes removed and 'https:' prepended.

    Raises ValueError when the page is not a video and no inline image URL
    can be found.
    """
    page = get_html(url)
    pid = get_photo_id(url, page)
    title = match1(page, pattern_inline_title)

    if match1(page, pattern_inline_video_mark):
        api_key = get_api_key(page)
        # Reuse the id extracted above instead of parsing the URL again.
        reply = get_content(tmpl_api_call_photo_info % (api_key, pid))
        secret = json.loads(reply)['photo']['secret']
        return get_orig_video_source(api_key, pid, secret), title

    # Author's note: the last match always has the best resolution.
    img_url = match1(page, pattern_inline_img_url)
    if not img_url:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError('Flickr: no inline image URL found in %s' % url)
    return 'https:' + img_url.replace('\\', ''), title
|
||||||
|
|
||||||
site_info = "Flickr.com"
|
# Module-level names bound for this extractor: the site name, the download
# entry point, and the playlist stub returned by playlist_not_supported.
site_info = "Flickr.com"
download = flickr_download_main
download_playlist = playlist_not_supported('flickr')
|
||||||
|
Loading…
Reference in New Issue
Block a user