unescape HTML entities in media titles

This commit is contained in:
lilydjwg 2014-04-11 19:42:13 +08:00
parent ecb7e84e6b
commit 36f7cf798f
3 changed files with 24 additions and 2 deletions

View File

@ -11,7 +11,7 @@ import platform
import threading
from .version import __version__
from .util import log, legitimize, sogou_proxy_server
from .util import log, legitimize, sogou_proxy_server, unescape
dry_run = False
force = False
@ -523,6 +523,7 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None,
traceback.print_exc(file = sys.stdout)
pass
title = unescape(title)
title = legitimize(title)
filename = '%s.%s' % (title, ext)
@ -727,7 +728,7 @@ def print_info(site_info, title, type, size):
type_info = "Unknown type (%s)" % type
print("Video Site:", site_info)
print("Title: ", tr(title))
print("Title: ", unescape(tr(title)))
print("Type: ", type_info)
print("Size: ", round(size / 1048576, 2), "MiB (" + str(size) + " Bytes)")
print()

View File

@ -3,3 +3,4 @@
from .fs import *
from .log import *
from .sogou_proxy import *
from .strings import *

View File

@ -0,0 +1,20 @@
try:
    # Python 3.4+ provides html.unescape; use it whenever it can be imported.
    from html import unescape
except ImportError:
    import re
    from html.entities import entitydefs

    # A single pattern covering hex references, decimal references and named
    # entities. Decoding everything in one pass means text produced by a
    # replacement is never decoded a second time ('&#38;lt;' -> '&lt;').
    _ENTITY_RE = re.compile(
        r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);')

    def _sharp2uni(m):
        '''&#...; ==> unicode

        `m` is a regex match whose full text is a numeric character
        reference such as '&#65;' or '&#x41;'. Returns the referenced
        character, or the matched text unchanged when the number cannot be
        parsed or is not a valid code point.
        '''
        s = m.group(0)[2:].rstrip(';')
        try:
            if s[:1] in ('x', 'X'):
                return chr(int(s[1:], 16))
            return chr(int(s))
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: keep the original text
            # rather than aborting the whole decode.
            return m.group(0)

    def unescape(string):
        '''HTML entity decode

        Replaces numeric references and the named entities listed in
        html.entities.entitydefs with their characters. Anything that
        merely looks like an entity but is unknown (e.g. the '&T;' in
        'AT&T; foo') is left as it is.
        '''
        def _decode(m):
            if m.group(1).startswith('#'):
                return _sharp2uni(m)
            # Unknown names fall back to the matched text instead of
            # raising KeyError.
            return entitydefs.get(m.group(1), m.group(0))

        return _ENTITY_RE.sub(_decode, string)