unescape HTML entities in media titles

2025-01-23 21:45:02 +03:00 · 2014-04-11 19:42:13 +08:00 · 2014-04-11 19:42:13 +08:00 · 36f7cf798f
commit 36f7cf798f
parent ecb7e84e6b
3 changed files with 24 additions and 2 deletions
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -11,7 +11,7 @@ import platform
 import threading

 from .version import __version__
-from .util import log, legitimize, sogou_proxy_server
+from .util import log, legitimize, sogou_proxy_server, unescape

 dry_run = False
 force = False
@@ -523,6 +523,7 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None,
            traceback.print_exc(file = sys.stdout)
            pass
    
+    title = unescape(title)
    title = legitimize(title)
    
    filename = '%s.%s' % (title, ext)
@@ -727,7 +728,7 @@ def print_info(site_info, title, type, size):
        type_info = "Unknown type (%s)" % type
    
    print("Video Site:", site_info)
-    print("Title:     ", tr(title))
+    print("Title:     ", unescape(tr(title)))
    print("Type:      ", type_info)
    print("Size:      ", round(size / 1048576, 2), "MiB (" + str(size) + " Bytes)")
    print()
--- a/src/you_get/util/__init__.py
+++ b/src/you_get/util/__init__.py
@@ -3,3 +3,4 @@
 from .fs import *
 from .log import *
 from .sogou_proxy import *
+from .strings import *
--- a/src/you_get/util/strings.py
+++ b/src/you_get/util/strings.py
@@ -0,0 +1,20 @@
+try:
+  # Python 3.4+ ships a complete decoder; the fallback below mirrors its basics.
+  from html import unescape
+except ImportError:
+  import re
+  from html.entities import entitydefs
+
+  def unescape(string):
+    '''Decode HTML entities in `string` in one pass; undecodable ones are kept as-is.'''
+    return re.sub(r'&(#[xX]?[0-9a-fA-F]+|\w+)；*;', _sharp2uni, string)  # ；* = stray fullwidth ';'
+
+  def _sharp2uni(m):
+    '''Return the replacement text for one matched entity (named, decimal or hex).'''
+    s = m.group(1)
+    try:
+      if s[0] != '#':
+        return entitydefs[s]  # named entity, e.g. amp -> &
+      return chr(int(s[2:], 16) if s[1] in 'xX' else int(s[1:]))
+    except (KeyError, ValueError, OverflowError):
+      return m.group(0)  # unknown name or invalid code point: leave the text untouched