From 14d05d3b2088fa405a92b6839332416c1d4287ed Mon Sep 17 00:00:00 2001 From: pl <0x00-pl@gmail.com> Date: Sat, 11 Oct 2014 00:15:54 +0800 Subject: [PATCH] fix douyutv title regex --- src/you_get/extractors/douyutv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/douyutv.py b/src/you_get/extractors/douyutv.py index 0292c94e..1dde4141 100644 --- a/src/you_get/extractors/douyutv.py +++ b/src/you_get/extractors/douyutv.py @@ -8,11 +8,11 @@ import json def douyutv_download(url, output_dir = '.', merge = True, info_only = False): html = get_html(url) - room_id_patt = '"room_id":(\d{1,99}),' - title_patt = '([^<]{0,1000})' + room_id_patt = r'"room_id":(\d{1,99}),' + title_patt = r'
\s*

([^<]{1,9999})

\s*
' roomid = re.findall(room_id_patt,html)[0] - title = re.findall(title_patt,html)[0] + title = unescape_html(re.findall(title_patt,html)[0]) conf = get_html("http://www.douyutv.com/api/client/room/"+roomid) metadata = json.loads(conf)