diff --git a/feed2toot/main.py b/feed2toot/main.py index 4d99bbb..9baf6a3 100644 --- a/feed2toot/main.py +++ b/feed2toot/main.py @@ -63,8 +63,9 @@ class Main(object): def main(self): """The main function.""" - # regex to retrieve urls in tags - regex = re.compile(r"]+src=\"([^\">]+)\"") + # regex_img_src to retrieve urls in tags + regex_img_src = re.compile(r"]*?\s+)?href=[\"'](.*?)[\"']|]+src=\"([^\">]+)\"") + regex_strip_urls = re.compile(r"([a-z]+\.twitter\.com(\/\S+)?|https?:\/\/twitter\.com((\/\w+)?)+)") http = urllib3.PoolManager() clip = CliParse() @@ -138,12 +139,17 @@ class Main(object): # get images contained in the entry images = [] if 'summary' in entry: - list_img_urls = regex.findall(entry['summary']) + list_img_urls = regex_img_src.findall(entry['summary']) if len(list_img_urls) > 0: - for img_url in list_img_urls: - resp = http.request('GET', img_url, preload_content=False) - images.append(resp) - resp.release_conn() + for img_url_tuple in list_img_urls: + for url in img_url_tuple: + if url != None and url != '': + resp = http.request('GET', url, preload_content=False) + if resp.headers['content-type'].startswith("image") or resp.headers['content-type'].startswith("video"): + logging.debug('Found media (type={type_media}) at url {url}'.format(type_media=resp.headers['content-type'], url=url)) + images.append(resp) + resp.release_conn() + severalwordsinhashtag = False # lets see if the rss feed has hashtag