diff --git a/src/you_get/extractors/flickr.py b/src/you_get/extractors/flickr.py
index b0f102f6..4efa78ef 100644
--- a/src/you_get/extractors/flickr.py
+++ b/src/you_get/extractors/flickr.py
@@ -1,39 +1,228 @@
#!/usr/bin/env python
-__all__ = ['flickr_download']
+__all__ = ['flickr_download_main']
from ..common import *
-def flickr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
- page = get_html(url)
- title = match1(page, r'(.+)')
- secret = match1(html, r'- (.+)
')
+# Regexes that classify a flickr.com URL by page type. Each one contains a
+# single capture group holding the identifier for that page type.
+# Album/photoset page: captures the path segment that follows the
+# sets/albums part (that part is optional in the regex).
+pattern_url_photoset = r'https?://www\.flickr\.com/photos/.+/(?:(?:sets)|(?:albums))?/([^/]+)'
+# Photostream page: captures the segment right after /photos/; the URL must
+# end there, optionally followed by '/' or '/page'.
+pattern_url_photostream = r'https?://www\.flickr\.com/photos/([^/]+)(?:/|(?:/page))?$'
+# Single photo page: captures the numeric id after /photos/<owner>/.
+pattern_url_single_photo = r'https?://www\.flickr\.com/photos/[^/]+/(\d+)'
+# Gallery page: captures the numeric id after /galleries/.
+pattern_url_gallery = r'https?://www\.flickr\.com/photos/[^/]+/galleries/(\d+)'
+# Group page: captures the segment after /groups/.
+pattern_url_group = r'https?://www\.flickr\.com/groups/([^/]+)'
+# Favorites page: captures the segment between /photos/ and /favorites.
+pattern_url_favorite = r'https?://www\.flickr\.com/photos/([^/]+)/favorites'
- html = get_html('https://secure.flickr.com/video_playlist.gne?node_id=%s&secret=%s' % (node_id, secret))
- app = match1(html, r'APP="([^"]+)"')
- fullpath = unescape_html(match1(html, r'FULLPATH="([^"]+)"'))
- url = app + fullpath
+pattern_inline_title = r'<title>([^<]*)</title>'
+# Regexes for values embedded in page source (presumably the HTML and inline
+# JSON of a flickr page -- confirm against the callers).
+# Captures the quoted value assigned to api.site_key.
+pattern_inline_api_key = r'api\.site_key\s*=\s*"([^"]+)"'
+# Captures the "url" value of a fragment shaped like "url":"...","key":"..."}}
+pattern_inline_img_url = r'"url":"([^"]+)","key":"[^"]+"}}'
+# Captures the quoted value of an "nsid" field.
+pattern_inline_NSID = r'"nsid"\s*:\s*"([^"]+)"'
+# Captures the literal marker "mediaType":"video" when it is present.
+pattern_inline_video_mark = r'("mediaType":"video")'
+# %-format template for a paged Flickr REST call.
+# Placeholders, in order: (api_key, method, ext, page)
+#   api_key -- %s, the API key
+#   method  -- %s, the method name without the leading 'flickr.'
+#   ext     -- %s, extra query-string text inserted verbatim just before
+#              '&page=' (so it has to carry its own leading '&')
+#   page    -- %d, the page number
+tmpl_api_call = (
+ 'https://api.flickr.com/services/rest?'
+ '&format=json&nojsoncallback=1'
+ # UNCOMMENT FOR TESTING
+ #'&per_page=5'
+ '&per_page=500'
+ # per_page has no effect on 'flickr.galleries.getPhotos', even though
+ # the documentation says it should;
+ # that method always behaves as if per_page were 500
+ '&api_key=%s'
+ '&method=flickr.%s'
+ '&extras=url_sq,url_q,url_t,url_s,url_n,url_m,url_z,url_c,url_l,url_h,url_k,url_o,media'
+ '%s&page=%d'
+)
+
+# %-format template for a flickr.video.getStreamInfo REST call.
+# Placeholders, in order: (api_key, photo_id, secret), all %s.
+tmpl_api_call_video_info = (
+ 'https://api.flickr.com/services/rest?'
+ '&format=json&nojsoncallback=1'
+ '&method=flickr.video.getStreamInfo'
+ '&api_key=%s'
+ '&photo_id=%s'
+ '&secret=%s'
+)
+
+# %-format template for a flickr.photos.getInfo REST call.
+# Placeholders, in order: (api_key, photo_id), both %s.
+tmpl_api_call_photo_info = (
+ 'https://api.flickr.com/services/rest?'
+ '&format=json&nojsoncallback=1'
+ '&method=flickr.photos.getInfo'
+ '&api_key=%s'
+ '&photo_id=%s'
+)
+
+# It looks like flickr does not return URLs for all of the sizes requested
+# in the 'extras' field unless the request carries an acceptable header,
+# so a browser-like User-Agent is sent.
+dummy_header = {
+ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0'
+}
+# Return get_content(url, dummy_header); the second argument is presumably
+# the request headers -- confirm against ..common.get_content.
+def get_content_headered(url):
+ return get_content(url, dummy_header)
+
+# Return what match1 extracts from url with pattern_url_photoset.
+# page is not used here; presumably it is accepted so that every parser
+# listed in url_patterns takes the same (url, page) arguments.
+def get_photoset_id(url, page):
+ return match1(url, pattern_url_photoset)
+
+# Return what match1 extracts from url with pattern_url_single_photo.
+# page is not used here.
+def get_photo_id(url, page):
+ return match1(url, pattern_url_single_photo)
+
+# Return what match1 extracts from url with pattern_url_gallery.
+# page is not used here; presumably it is accepted so that every parser
+# listed in url_patterns takes the same (url, page) arguments.
+def get_gallery_id(url, page):
+ return match1(url, pattern_url_gallery)
+
+# Return the value pattern_inline_api_key captures from page. If page gives
+# no match, fetch https://flickr.com with get_html and search that instead.
+def get_api_key(page):
+ match = match1(page, pattern_inline_api_key)
+ # The fallback below is needed only when the url points to a gallery page
+ # that contains no inline api_key (and never makes xhr api calls).
+ # In fact this might be a better approach for getting a temporary api key,
+ # since the homepage has no place for a user to add custom information
+ # that could mislead the regex.
+ if not match:
+ return match1(get_html('https://flickr.com'), pattern_inline_api_key)
+ return match
+
+# Return what match1 extracts from page with pattern_inline_NSID.
+# url is not used here; presumably it is accepted so that every parser
+# listed in url_patterns takes the same (url, page) arguments.
+def get_NSID(url, page):
+ return match1(page, pattern_inline_NSID)
+
+# Table of supported collection page types. Each entry is a 5-tuple:
+# [
+# (
+# regex_match_url,
+# remote_api_method,
+# additional_query_parameter_for_method,
+# parser_for_additional_parameter,
+# field_where_photourls_are_saved
+# )
+# ]
+# remote_api_method is written without the 'flickr.' prefix. The parser is
+# one of the get_* functions taking (url, page). How the last field is used
+# is not visible here -- presumably it is a key in the API response; confirm
+# against the code that consumes this table.
+url_patterns = [
+ # www.flickr.com/photos/{username|NSID}/sets|albums/{album-id}
+ (
+ pattern_url_photoset,
+ 'photosets.getPhotos',
+ 'photoset_id',
+ get_photoset_id,
+ 'photoset'
+ ),
+ # www.flickr.com/photos/{username|NSID}/{pageN}?
+ (
+ pattern_url_photostream,
+ # according to the flickr api documentation, this method needs to be
+ # authenticated in order to filter photos visible to the calling user,
+ # but it seems to work fine anonymously as well
+ 'people.getPhotos',
+ 'user_id',
+ get_NSID,
+ 'photos'
+ ),
+ # www.flickr.com/photos/{username|NSID}/galleries/{gallery-id}
+ (
+ pattern_url_gallery,
+ 'galleries.getPhotos',
+ 'gallery_id',
+ get_gallery_id,
+ 'photos'
+ ),
+ # www.flickr.com/groups/{groupname|groupNSID}/
+ (
+ pattern_url_group,
+ 'groups.pools.getPhotos',
+ 'group_id',
+ get_NSID,
+ 'photos'
+ ),
+ # www.flickr.com/photos/{username|NSID}/favorites/*
+ (
+ pattern_url_favorite,
+ 'favorites.getList',
+ 'user_id',
+ get_NSID,
+ 'photos'
+ )
+]
+
+# Entry point of the extractor: turn a flickr page url into a list of media
+# URLs plus a title, print info for each URL and, unless info_only is set,
+# download it.
+#   url        -- flickr page address; a single-photo URL is handled by
+#                 get_single_photo_url, anything else by fetch_photo_url_list
+#   output_dir -- passed through to download_urls
+#   merge      -- accepted but not referenced in the new lines shown here
+#   info_only  -- when true, the download step is skipped
+#   kwargs     -- 'stream_id', if present, replaces the default size 'o'
+def flickr_download_main(url, output_dir = '.', merge = False, info_only = False, **kwargs):
+ urls = None
+ size = 'o' # works for collections only
+ title = None
+ if 'stream_id' in kwargs:
+ size = kwargs['stream_id']
+ if match1(url, pattern_url_single_photo):
+ url, title = get_single_photo_url(url)
+ urls = [url]
+ else:
+ urls, title = fetch_photo_url_list(url, size)
+ index = 0
+ # NOTE: the loop variable rebinds the url parameter, and the url_info
+ # result below rebinds size (until here the requested size label).
+ for url in urls:
mime, ext, size = url_info(url)
-
- print_info(site_info, title, mime, size)
+ print_info('Flickr.com', title, mime, size)
if not info_only:
- download_urls([url], title, ext, size, output_dir, merge=merge, faker=True)
+ # '[<index>]' is appended to the title passed for each URL.
+ suffix = '[%d]' % index
+ # NOTE(review): the positional arguments after ext are presumably
+ # total_size, output_dir, refer, merge, faker -- confirm against
+ # ..common.download_urls.
+ download_urls([url], title + suffix, ext, False, output_dir, None, False, False)
+ index = index + 1
- except: # extract images
- image = match1(page, r'