Encoding handling fixes:

1. Introduce strings.safe_chars and strings.safe_print as ways to ensure
that a string is encodable in a specified encoding.  Characters that
cannot be encoded are replaced with '?'.  safe_print delegates to print
and satisfies the same interface, so it can be used as a drop-in
override for print in any file (see the first sketch after this list).

2. Move get_filename to fs, since that's where it belongs (fs-related
filename handling).  Appending of the ID, part number, and extension
(when applicable) now happens inside get_filename, to avoid accidental
truncation (see the second sketch after this list).

3. Remove common.tr, since the print override supersedes it.

4. Refactor the log module to work with these changes (use print with
explicit file arguments instead of direct writes to stdout and stderr);
see the third sketch after this list.

5. Modify other files to accommodate the changes (remove calls to tr).

6. Random cleanup I found:

a. Some changes to the implementation of download_urls and
download_urls_chunked (is the latter even used?).
b. A sina_download_by_id call in bilibili_download that should be
sina_download_by_vid.
c. ffmpeg_convert_ts_to_mkv tried to convert multiple input files into
the same output file, overwriting its own output each time; it now takes
a single input file.
d. Missing @staticmethod decorators (the IDE complains otherwise).

7. Tests for the new encoding handling.
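
Roughly how the two new helpers behave (an illustrative sketch based on
the strings.py changes in this commit; the euro sign is just an example
of a character that ASCII cannot encode):

    from you_get.util.strings import safe_chars, safe_print as print

    # Characters the target encoding cannot represent become '?'.
    safe_chars('a\u20ACc', encoding='ascii')   # -> 'a?c'
    safe_chars('a\u20ACc', encoding='utf-8')   # -> 'a\u20ACc' (unchanged)

    # safe_print has the same interface as print, but runs each argument
    # through safe_chars using the destination file's encoding first, so
    # importing it as `print` is a drop-in override that avoids
    # UnicodeEncodeError on consoles with a narrow encoding.
    print('Title: ', 'a\u20ACc')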
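
The new fs.get_filename, sketched from the added tests (the 'hi' ID and
the euro sign are placeholder values):

    from you_get.util.fs import get_filename

    # get_filename(basename, ext, id=None, part=None, encoding=..., **kwargs)
    get_filename('name', 'ext', part=2, os='Linux', encoding='utf-8')
    # -> 'name[02].ext'  (the part number and extension are appended after
    #    the basename is truncated, so they can no longer be cut off)

    # If replacing unsafe characters changed the basename, the ID is
    # appended so distinct titles do not collapse onto the same filename.
    get_filename('name\u20AC', 'ext', id='hi', os='Linux', encoding='ascii')
    # -> 'name? - hi.ext'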
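
The gist of the log refactor, as a trimmed excerpt of log.py below: each
helper builds its text with the existing sprint color wrapper and hands
it to print (i.e. safe_print) with an explicit file, instead of writing
to the streams directly.  Because safe_print encodes against the
destination file's encoding, messages sent to stderr get the same
protection as stdout:

    import sys
    from .strings import safe_print as print

    # sprint() is log.py's existing helper that wraps text in color codes.
    def println(text, *colors):
        """Print text to standard output."""
        print(sprint(text, *colors), file=sys.stdout)

    def print_err(text, *colors):
        """Print text to standard error."""
        print(sprint(text, *colors), file=sys.stderr)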
henryptung 2015-01-10 22:13:09 -08:00
parent 1b55b01b04
commit 79fd1255cb
16 changed files with 152 additions and 130 deletions

View File

@ -11,7 +11,8 @@ from urllib import request, parse
from .version import __version__
from .util import log
from .util.strings import get_filename, unescape_html
from .util.strings import unescape_html, safe_print as print
from .util.fs import get_filename
dry_run = False
force = False
@ -27,18 +28,6 @@ fake_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0'
}
if sys.stdout.isatty():
default_encoding = sys.stdout.encoding.lower()
else:
default_encoding = locale.getpreferredencoding().lower()
def tr(s):
if default_encoding == 'utf-8':
return s
else:
return s
#return str(s.encode('utf-8'))[2:-1]
# DEPRECATED in favor of match1()
def r1(pattern, text):
m = re.search(pattern, text)
@ -272,7 +261,7 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False):
if not is_part:
if bar:
bar.done()
print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
print('Skipping %s: file already exists' % os.path.basename(filepath))
else:
if bar:
bar.update_received(file_size)
@ -281,7 +270,7 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False):
if not is_part:
if bar:
bar.done()
print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
print('Overwriting %s' % os.path.basename(filepath), '...')
elif not os.path.exists(os.path.dirname(filepath)):
os.mkdir(os.path.dirname(filepath))
@ -348,7 +337,7 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker =
if not is_part:
if bar:
bar.done()
print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
print('Skipping %s: file already exists' % os.path.basename(filepath))
else:
if bar:
bar.update_received(os.path.getsize(filepath))
@ -357,7 +346,7 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker =
if not is_part:
if bar:
bar.done()
print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
print('Overwriting %s' % os.path.basename(filepath), '...')
elif not os.path.exists(os.path.dirname(filepath)):
os.mkdir(os.path.dirname(filepath))
@ -490,13 +479,10 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg
total_size = urls_size(urls)
except:
import traceback
import sys
traceback.print_exc(file = sys.stdout)
pass
title = tr(get_filename(title))
filename = '%s.%s' % (title, ext)
filename = get_filename(title, ext)
filepath = os.path.join(output_dir, filename)
if total_size:
if not force and os.path.exists(filepath) and os.path.getsize(filepath) >= total_size * 0.9:
@ -507,35 +493,32 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg
else:
bar = PiecesProgressBar(total_size, len(urls))
print('Downloading %s ...' % filename)
if len(urls) == 1:
url = urls[0]
print('Downloading %s ...' % tr(filename))
url_save(url, filepath, bar, refer = refer, faker = faker)
bar.done()
else:
parts = []
print('Downloading %s.%s ...' % (tr(title), ext))
for i, url in enumerate(urls):
filename = '%s[%02d].%s' % (title, i, ext)
filepath = os.path.join(output_dir, filename)
parts.append(filepath)
#print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
part_filepath = os.path.join(output_dir, get_filename(title, ext, part=i))
parts.append(part_filepath)
#print('Downloading %s [%s/%s]...' % (filename, i + 1, len(urls)))
bar.update_piece(i + 1)
url_save(url, filepath, bar, refer = refer, is_part = True, faker = faker)
url_save(url, part_filepath, bar, refer = refer, is_part = True, faker = faker)
bar.done()
from .processor import ffmpeg
if not merge:
print()
return
if ext in ['flv', 'f4v']:
try:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_flv_to_mp4
ffmpeg_concat_flv_to_mp4(parts, os.path.join(output_dir, title + '.mp4'))
if ffmpeg.has_ffmpeg_installed():
ffmpeg.ffmpeg_concat_flv_to_mp4(parts, filepath)
else:
from .processor.join_flv import concat_flv
concat_flv(parts, os.path.join(output_dir, title + '.flv'))
concat_flv(parts, filepath)
except:
raise
else:
@ -544,13 +527,11 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg
elif ext == 'mp4':
try:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_mp4_to_mp4
ffmpeg_concat_mp4_to_mp4(parts, os.path.join(output_dir, title + '.mp4'))
if ffmpeg.has_ffmpeg_installed():
ffmpeg.ffmpeg_concat_mp4_to_mp4(parts, filepath)
else:
from .processor.join_mp4 import concat_mp4
concat_mp4(parts, os.path.join(output_dir, title + '.mp4'))
concat_mp4(parts, filepath)
except:
raise
else:
@ -574,68 +555,59 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No
assert ext in ('ts')
title = tr(get_filename(title))
filename = '%s.%s' % (title, 'ts')
filename = get_filename(title, '.mkv')
filepath = os.path.join(output_dir, filename)
if total_size:
if not force and os.path.exists(filepath[:-3] + '.mkv'):
print('Skipping %s: file already exists' % filepath[:-3] + '.mkv')
if not force and os.path.exists(filepath):
print('Skipping %s: file already exists' % filepath)
print()
return
bar = SimpleProgressBar(total_size, len(urls))
else:
bar = PiecesProgressBar(total_size, len(urls))
print('Downloading %s ...' % filename)
if len(urls) == 1:
parts = []
temp_filepath = os.path.join(output_dir, get_filename(title, ext))
url = urls[0]
print('Downloading %s ...' % tr(filename))
filepath = os.path.join(output_dir, filename)
parts.append(filepath)
url_save_chunked(url, filepath, bar, refer = refer, faker = faker)
url_save_chunked(url, temp_filepath, bar, refer = refer, faker = faker)
bar.done()
from .processor import ffmpeg
if not merge:
print()
return
if ext == 'ts':
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_convert_ts_to_mkv
if ffmpeg_convert_ts_to_mkv(parts, os.path.join(output_dir, title + '.mkv')):
for part in parts:
os.remove(part)
if ffmpeg.has_ffmpeg_installed():
if ffmpeg.ffmpeg_convert_ts_to_mkv(temp_filepath, filepath):
os.remove(temp_filepath)
else:
os.remove(os.path.join(output_dir, title + '.mkv'))
os.remove(filepath)
else:
print('No ffmpeg is found. Conversion aborted.')
else:
print("Can't convert %s files" % ext)
else:
parts = []
print('Downloading %s.%s ...' % (tr(title), ext))
for i, url in enumerate(urls):
filename = '%s[%02d].%s' % (title, i, ext)
filepath = os.path.join(output_dir, filename)
parts.append(filepath)
#print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
part_filepath = os.path.join(output_dir, get_filename(title, ext, part=i))
parts.append(part_filepath)
#print('Downloading %s [%s/%s]...' % (filename, i + 1, len(urls)))
bar.update_piece(i + 1)
url_save_chunked(url, filepath, bar, refer = refer, is_part = True, faker = faker)
url_save_chunked(url, part_filepath, bar, refer = refer, is_part = True, faker = faker)
bar.done()
from .processor import ffmpeg
if not merge:
print()
return
if ext == 'ts':
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_ts_to_mkv
if ffmpeg_concat_ts_to_mkv(parts, os.path.join(output_dir, title + '.mkv')):
if ffmpeg.has_ffmpeg_installed():
if ffmpeg.ffmpeg_concat_ts_to_mkv(parts, filepath):
for part in parts:
os.remove(part)
else:
os.remove(os.path.join(output_dir, title + '.mkv'))
os.remove(filepath)
else:
print('No ffmpeg is found. Merging aborted.')
else:
@ -717,7 +689,7 @@ def print_info(site_info, title, type, size):
type_info = "Unknown type (%s)" % type
print("Video Site:", site_info)
print("Title: ", unescape_html(tr(title)))
print("Title: ", unescape_html(title))
print("Type: ", type_info)
print("Size: ", round(size / 1048576, 2), "MiB (" + str(size) + " Bytes)")
print()

View File

@ -2,6 +2,7 @@
from .common import match1, download_urls, parse_host, set_proxy, unset_proxy
from .util import log
from .util.strings import safe_print as print
class Extractor():
def __init__(self, *args):

View File

@ -40,11 +40,11 @@ def acfun_download_by_vid(vid, title=None, output_dir='.', merge=True, info_only
raise NotImplementedError(sourceType)
if not info_only:
title = get_filename(title)
filename = get_filename(title, '.cmt.json', id=vid)
try:
print('Downloading %s ...\n' % (title + '.cmt.json'))
print('Downloading %s ...\n' % filename)
cmt = get_srt_json(vid)
with open(os.path.join(output_dir, title + '.cmt.json'), 'w') as x:
with open(os.path.join(output_dir, filename), 'w') as x:
x.write(cmt)
# print('Downloading %s ...\n' % (title + '.cmt_lock.json'))
# cmt = get_srt_lock_json(danmakuId)

View File

@ -150,7 +150,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False):
bilibili_download_by_cids(cids, title, output_dir=output_dir, merge=merge, info_only=info_only)
elif t == 'vid':
sina_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
sina_download_by_vid(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
elif t == 'ykid':
youku_download_by_vid(id, title=title, output_dir = output_dir, merge = merge, info_only = info_only)
elif t == 'uid':
@ -159,10 +159,10 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False):
raise NotImplementedError(flashvars)
if not info_only:
title = get_filename(title)
print('Downloading %s ...\n' % (title + '.cmt.xml'))
filename = get_filename(title, '.cmt.xml', id=id)
print('Downloading %s ...\n' % filename)
xml = get_srt_xml(id)
with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as x:
x.write(xml)
site_info = "bilibili.com"

View File

@ -107,12 +107,12 @@ def download_url_chunked(url, title, ext, size, output_dir = '.', refer = None,
filepath = os.path.join(output_dir, filename)
if not force and os.path.exists(filepath):
print('Skipping %s: file already exists' % tr(filepath))
print('Skipping %s: file already exists' % filepath)
print()
return
bar = DummyProgressBar()
print('Downloading %s ...' % tr(filename))
print('Downloading %s ...' % filename)
url_save_chunked(url, filepath, bar, refer = refer, faker = faker)
bar.done()

View File

@ -14,12 +14,12 @@ def parse_size(size):
else:
return 0
def dongting_download_lyric(lrc_url, file_name, output_dir):
def dongting_download_lyric(lrc_url, basename, sid, output_dir):
j = get_html(lrc_url)
info = json.loads(j)
lrc = j['data']['lrc']
filename = get_filename(file_name)
with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x:
lrc = info['data']['lrc']
filename = get_filename(basename, '.lrc', id=sid)
with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as x:
x.write(lrc)
def dongting_download_song(sid, output_dir = '.', merge = True, info_only = False):
@ -35,13 +35,13 @@ def dongting_download_song(sid, output_dir = '.', merge = True, info_only = Fals
print_info(site_info, song_title, ext, size)
if not info_only:
file_name = "%s - %s - %s" % (song_title, album_name, artist)
download_urls([url], file_name, ext, size, output_dir, merge = merge)
basename = "%s - %s - %s" % (song_title, album_name, artist)
download_urls([url], basename, ext, size, output_dir, merge = merge)
lrc_url = ('http://lp.music.ttpod.com/lrc/down?'
'lrcid=&artist=%s&title=%s') % (
parse.quote(artist), parse.quote(song_title))
try:
dongting_download_lyric(lrc_url, file_name, output_dir)
dongting_download_lyric(lrc_url, basename, output_dir)
except:
pass

View File

@ -27,11 +27,11 @@ def location_dec(str):
out += char
return parse.unquote(out).replace("^", "0")
def xiami_download_lyric(lrc_url, file_name, output_dir):
def xiami_download_lyric(lrc_url, basename, sid, output_dir):
lrc = get_html(lrc_url, faker = True)
filename = get_filename(file_name)
filename = get_filename(basename, '.lrc', id=sid)
if len(lrc) > 0:
with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x:
with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as x:
x.write(lrc)
def xiami_download_pic(pic_url, file_name, output_dir):
@ -61,10 +61,10 @@ def xiami_download_song(sid, output_dir = '.', merge = True, info_only = False):
print_info(site_info, song_title, ext, size)
if not info_only:
file_name = "%s - %s - %s" % (song_title, album_name, artist)
download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True)
basename = "%s - %s - %s" % (song_title, album_name, artist)
download_urls([url], basename, ext, size, output_dir, merge = merge, faker = True)
try:
xiami_download_lyric(lrc_url, file_name, output_dir)
xiami_download_lyric(lrc_url, basename, output_dir)
except:
pass

View File

@ -19,6 +19,7 @@ class Youku(VideoExtractor):
{'id': '3gphd', 'container': '3gp', 'video_profile': '高清3GP'},
]
@staticmethod
def generate_ep(vid, ep):
f_code_1 = 'becaf9be'
f_code_2 = 'bf7e5f01'
@ -49,9 +50,11 @@ class Youku(VideoExtractor):
new_ep = trans_e(f_code_2, '%s_%s_%s' % (sid, vid, token))
return base64.b64encode(bytes(new_ep, 'latin')), sid, token
@staticmethod
def parse_m3u8(m3u8):
return re.findall(r'(http://[^?]+)\?ts_start=0', m3u8)
@staticmethod
def get_vid_from_url(url):
"""Extracts video ID from URL.
"""
@ -59,6 +62,7 @@ class Youku(VideoExtractor):
match1(url, r'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf') or \
match1(url, r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)')
@staticmethod
def get_playlist_id_from_url(url):
"""Extracts playlist ID from URL.
"""

View File

@ -23,8 +23,7 @@ FFMPEG, FFMPEG_VERSION = get_usable_ffmpeg('ffmpeg') or get_usable_ffmpeg('avcon
def has_ffmpeg_installed():
return FFMPEG is not None
def ffmpeg_convert_ts_to_mkv(files, output='output.mkv'):
for file in files:
def ffmpeg_convert_ts_to_mkv(file, output='output.mkv'):
if os.path.isfile(file):
params = [FFMPEG, '-y', '-i']
params.append(file)

View File

@ -2,6 +2,7 @@
import os.path
import subprocess
from ..util.strings import safe_print as print
def get_usable_rtmpdump(cmd):
try:

View File

@ -1,6 +1,8 @@
#!/usr/bin/env python
import platform
import sys
from .strings import safe_chars
def legitimize(text, os=platform.system()):
"""Converts a string to a valid filename.
@ -41,5 +43,13 @@ def legitimize(text, os=platform.system()):
if text.startswith("."):
text = text[1:]
text = text[:82] # Trim to 82 Unicode characters long
return text
def get_filename(basename, ext, id=None, part=None, encoding=sys.getfilesystemencoding(), **kwargs):
safe_basename = safe_chars(basename, encoding=encoding)
if safe_basename != basename and id is not None:
safe_basename = safe_chars('%s - %s' % (basename, id), encoding=encoding)
safe_basename = safe_basename[:82] # Trim to 82 Unicode characters long
if part is not None:
safe_basename = '%s[%02d]' % (safe_basename, part)
return legitimize('%s.%s' % (safe_basename, ext), **kwargs)

View File

@ -2,6 +2,7 @@
# This file is Python 2 compliant.
from .. import __name__ as library_name
from .strings import safe_print as print
import os, sys
@ -62,15 +63,15 @@ def sprint(text, *colors):
def println(text, *colors):
"""Print text to standard output."""
sys.stdout.write(sprint(text, *colors) + "\n")
print(sprint(text, *colors), file=sys.stdout)
def print_err(text, *colors):
"""Print text to standard error."""
sys.stderr.write(sprint(text, *colors) + "\n")
print(sprint(text, *colors), file=sys.stderr)
def print_log(text, *colors):
"""Print a log message to standard error."""
sys.stderr.write(sprint("{}: {}".format(library_name, text), *colors) + "\n")
print_err("{}: {}".format(library_name, text), *colors)
def i(message):
"""Print a normal log message."""

View File

@ -19,7 +19,11 @@ except ImportError:
else:
return chr(int(s))
from .fs import legitimize
import sys
def get_filename(htmlstring):
return legitimize(unescape_html(htmlstring))
def safe_chars(s, encoding=sys.getdefaultencoding()):
return s.encode(encoding, 'replace').decode(encoding)
def safe_print(*objects, file=sys.stdout, **kwargs):
safe_strs = [safe_chars(str(obj), encoding=file.encoding) for obj in objects]
print(*safe_strs, file=file, **kwargs)

View File

@ -1,11 +0,0 @@
#!/usr/bin/env python
import unittest
from you_get.util.fs import *
class TestUtil(unittest.TestCase):
def test_legitimize(self):
self.assertEqual(legitimize("1*2", os="Linux"), "1*2")
self.assertEqual(legitimize("1*2", os="Darwin"), "1*2")
self.assertEqual(legitimize("1*2", os="Windows"), "1-2")

tests/util/test_fs.py (new file, 28 additions)
View File

@ -0,0 +1,28 @@
#!/usr/bin/env python
import unittest
from you_get.util.fs import *
class TestFs(unittest.TestCase):
def test_legitimize(self):
self.assertEqual(legitimize("1*2", os="Linux"), "1*2")
self.assertEqual(legitimize("1*2", os="Darwin"), "1*2")
self.assertEqual(legitimize("1*2", os="Windows"), "1-2")
def test_get_filename_simple(self):
self.assertEqual('name.ext', get_filename('name', 'ext', os='Linux', encoding='utf-8'))
def test_get_filename_parts(self):
self.assertEqual('name[02].ext', get_filename('name', 'ext', part=2, os='Linux', encoding='utf-8'))
self.assertEqual('name(02).ext', get_filename('name', 'ext', part=2, os='Windows', encoding='utf-8'))
def test_get_filename_encoding_error(self):
self.assertEqual('name\u20AC.ext', get_filename('name\u20AC', 'ext', os='Linux', encoding='utf-8'))
self.assertEqual('name\u20AC.ext', get_filename('name\u20AC', 'ext', os='Windows', encoding='utf-8'))
self.assertEqual('name?.ext', get_filename('name\u20AC', 'ext', os='Linux', encoding='ascii'))
self.assertEqual('name-.ext', get_filename('name\u20AC', 'ext', os='Windows', encoding='ascii'))
def test_get_filename_id(self):
self.assertEqual('name\u20AC.ext', get_filename('name\u20AC', 'ext', os='Linux', id='hi', encoding='utf-8'))
self.assertEqual('name? - hi.ext', get_filename('name\u20AC', 'ext', os='Linux', id='hi', encoding='ascii'))

View File

@ -0,0 +1,13 @@
#!/usr/bin/env python
import unittest
from you_get.util.strings import *
class TestStrings(unittest.TestCase):
def test_safe_chars_simple(self):
self.assertEqual('', safe_chars('', encoding='utf-8'))
self.assertEqual('abc', safe_chars('abc', encoding='utf-8'))
def test_safe_chars_replace(self):
self.assertEqual('a?c', safe_chars('a\u20ACc', encoding='ascii'))