#!/usr/bin/env python
"""Extractor for flickr.com single photos and photo collections.

Module-level constants hold the regexes used on page URLs and inline page
content, plus the REST API URL templates.  ``url_patterns`` maps each
supported collection URL shape to the API method that lists its photos, and
``flickr_download_main`` is the public entry point.
"""

# Provenance: module text taken from the patch "rebuild flickr extractor"
# (commit 91b196baef0414858a2614d64b32269e1a7f4c78, YenvY, 2017-06-14).

__all__ = ['flickr_download_main']

from ..common import *

# --- regexes applied to the page URL (group 1 is the id / user name) -------
pattern_url_photoset = r'https?://www\.flickr\.com/photos/.+/(?:(?:sets)|(?:albums))?/([^/]+)'
pattern_url_photostream = r'https?://www\.flickr\.com/photos/([^/]+)(?:/|(?:/page))?$'
pattern_url_single_photo = r'https?://www\.flickr\.com/photos/[^/]+/(\d+)'
pattern_url_gallery = r'https?://www\.flickr\.com/photos/[^/]+/galleries/(\d+)'
pattern_url_group = r'https?://www\.flickr\.com/groups/([^/]+)'
pattern_url_favorite = r'https?://www\.flickr\.com/photos/([^/]+)/favorites'

# --- regexes applied to the downloaded page content -------------------------
# NOTE(review): this pattern has no literal text around its group; it looks as
# if surrounding markup may have been lost -- confirm the intended pattern.
pattern_inline_title = r'([^<]*)'
pattern_inline_api_key = r'api\.site_key\s*=\s*"([^"]+)"'
pattern_inline_img_url = r'"url":"([^"]+)","key":"[^"]+"}}'
pattern_inline_NSID = r'"nsid"\s*:\s*"([^"]+)"'
pattern_inline_video_mark = r'("mediaType":"video")'

# Format arguments, in order: (api_key, method, ext, page).
# `method` is appended to 'flickr.'; `ext` is inserted verbatim right before
# '&page=' (presumably an already-formatted '&name=value' fragment -- confirm
# against the caller).
tmpl_api_call = (
    'https://api.flickr.com/services/rest?'
    '&format=json&nojsoncallback=1'
    # UNCOMMENT FOR TESTING
    #'&per_page=5'
    '&per_page=500'
    # This parameter CANNOT control 'flickr.galleries.getPhotos', although
    # the documentation says it should: there it is always treated as 500.
    '&api_key=%s'
    '&method=flickr.%s'
    '&extras=url_sq,url_q,url_t,url_s,url_n,url_m,url_z,url_c,url_l,url_h,url_k,url_o,media'
    '%s&page=%d'
)

# Format arguments, in order: (api_key, photo_id, secret).
tmpl_api_call_video_info = (
    'https://api.flickr.com/services/rest?'
    '&format=json&nojsoncallback=1'
    '&method=flickr.video.getStreamInfo'
    '&api_key=%s'
    '&photo_id=%s'
    '&secret=%s'
)

# Format arguments, in order: (api_key, photo_id).
tmpl_api_call_photo_info = (
    'https://api.flickr.com/services/rest?'
    '&format=json&nojsoncallback=1'
    '&method=flickr.photos.getInfo'
    '&api_key=%s'
    '&photo_id=%s'
)

# Flickr does not seem to return URLs for every size requested in the
# 'extras' field unless the request carries an acceptable User-Agent header.
dummy_header = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0'
}


def get_content_headered(url):
    """Fetch `url` with `get_content`, passing `dummy_header` as the headers."""
    return get_content(url, dummy_header)


def get_photoset_id(url, page):
    """Return group 1 of `pattern_url_photoset` matched against `url`.

    `page` is unused; it is accepted so that every parser stored in
    `url_patterns` shares the same `(url, page)` signature.
    """
    return match1(url, pattern_url_photoset)


def get_photo_id(url, page):
    """Return the numeric photo id captured from `url` (`page` is unused)."""
    return match1(url, pattern_url_single_photo)


def get_gallery_id(url, page):
    """Return the numeric gallery id captured from `url` (`page` is unused)."""
    return match1(url, pattern_url_gallery)


def get_api_key(page):
    """Return the API key embedded in `page`, falling back to the homepage.

    The fallback is only needed when the URL points to a gallery page, which
    contains no inline api_key (and never makes XHR API calls).  Reading the
    key from the homepage might in fact be the better approach for getting a
    temporary API key, since there is no place there for a user to add custom
    information that could misguide the regex.
    """
    inline_key = match1(page, pattern_inline_api_key)
    if inline_key:
        return inline_key
    return match1(get_html('https://flickr.com'), r'"site_key"\s*:\s*"([^"]+)"')


def get_NSID(url, page):
    """Return the first "nsid" value found in `page` (`url` is unused)."""
    return match1(page, pattern_inline_NSID)


# Each entry of `url_patterns` is a tuple of:
#   (
#       regex_match_url,
#       remote_api_method,
#       additional_query_parameter_for_method,
#       parser_for_additional_parameter,
#       field_where_photourls_are_saved
#   )
url_patterns = [
    # www.flickr.com/photos/{username|NSID}/sets|albums/{album-id}
    (
        pattern_url_photoset,
        'photosets.getPhotos',
        'photoset_id',
        get_photoset_id,
        'photoset'
    ),
    # www.flickr.com/photos/{username|NSID}/{pageN}?
    (
        pattern_url_photostream,
        # According to the Flickr API documentation this method needs to be
        # authenticated in order to filter photos visible to the calling
        # user, but it seems to work fine anonymously as well.
        'people.getPhotos',
        'user_id',
        get_NSID,
        'photos'
    ),
    # www.flickr.com/photos/{username|NSID}/galleries/{gallery-id}
    (
        pattern_url_gallery,
        'galleries.getPhotos',
        'gallery_id',
        get_gallery_id,
        'photos'
    ),
    # www.flickr.com/groups/{groupname|groupNSID}/
    (
        pattern_url_group,
        'groups.pools.getPhotos',
        'group_id',
        get_NSID,
        'photos'
    ),
    # www.flickr.com/photos/{username|NSID}/favorites/*
    (
        pattern_url_favorite,
        'favorites.getList',
        'user_id',
        get_NSID,
        'photos'
    )
]


def flickr_download_main(url, output_dir='.', merge=False, info_only=False, **kwargs):
    """Download the photo, or every photo of the collection, behind `url`.

    Args:
        url: a flickr.com page URL.  URLs matching `pattern_url_single_photo`
            are handled as one photo; anything else is handled as a
            collection.
        output_dir: directory handed to `download_urls`.
        merge: accepted for interface compatibility; this function does not
            reference it.
        info_only: when true, only print the media info and download nothing.
        **kwargs: 'stream_id', when present, selects the size for
            collections (default 'o').
    """
    # The size selector works for collections only.
    wanted_size = kwargs.get('stream_id', 'o')

    # `get_single_photo_url` and `fetch_photo_url_list` are expected to be
    # defined elsewhere in this module.
    if match1(url, pattern_url_single_photo):
        photo_url, title = get_single_photo_url(url)
        urls = [photo_url]
    else:
        urls, title = fetch_photo_url_list(url, wanted_size)

    # Distinct loop names keep the `url` argument and the size selector from
    # being overwritten by per-photo values.
    for index, photo_url in enumerate(urls):
        mime, ext, byte_size = url_info(photo_url)
        print_info('Flickr.com', title, mime, byte_size)
        if not info_only:
            suffix = '[%d]' % index
            # NOTE(review): the positional arguments after `ext` are passed
            # exactly as before; confirm them against download_urls' signature.
            download_urls([photo_url], title + suffix, ext, False, output_dir, None, False, False)