Added URL scanning for media.

2017-07-06 13:04:45 +02:00 · 2017-07-06 13:04:45 +02:00 · 81c0f8a15c
parent 13b427942f
commit 81c0f8a15c
1 changed files with 13 additions and 7 deletions
--- a/feed2toot/main.py
+++ b/feed2toot/main.py
@@ -63,8 +63,9 @@ class Main(object):

    def main(self):
        """The main function."""
-        # regex to retrieve urls in <img> tags
-        regex = re.compile(r"<img[^>]+src=\"([^\">]+)\"")
+        # regex_img_src retrieves candidate media urls from <a href="..."> and <img src="..."> attributes
+        regex_img_src = re.compile(r"<a\s+(?:[^>]*?\s+)?href=[\"'](.*?)[\"']|<img[^>]+src=\"([^\">]+)\"")
+        regex_strip_urls = re.compile(r"([a-z]+\.twitter\.com(\/\S+)?|https?:\/\/twitter\.com((\/\w+)?)+)")
        http = urllib3.PoolManager()

        clip = CliParse()
@@ -138,12 +139,17 @@ class Main(object):
                    # get images contained in the entry
                    images = []
                    if 'summary' in entry:
-                        list_img_urls = regex.findall(entry['summary'])
+                        list_img_urls = regex_img_src.findall(entry['summary'])
                        if len(list_img_urls) > 0:
-                            for img_url in list_img_urls:
-                                resp = http.request('GET', img_url, preload_content=False)
-                                images.append(resp)
-                                resp.release_conn()
+                            for img_url_tuple in list_img_urls:
+                                for url in img_url_tuple:
+                                    if url != None and url != '':
+                                        resp = http.request('GET', url, preload_content=False)
+                                        if resp.headers['content-type'].startswith("image") or resp.headers['content-type'].startswith("video"):
+                                            logging.debug('Found media (type={type_media}) at url {url}'.format(type_media=resp.headers['content-type'], url=url))
+                                            images.append(resp)
+                                            resp.release_conn()
+

                    severalwordsinhashtag = False
                    # lets see if the rss feed has hashtag