Added URL scanning for media.
This commit is contained in:
parent
13b427942f
commit
81c0f8a15c
|
@ -63,8 +63,9 @@ class Main(object):
|
||||||
|
|
||||||
def main(self):
|
def main(self):
|
||||||
"""The main function."""
|
"""The main function."""
|
||||||
# regex to retrieve urls in <img> tags
|
# regex_img_src to retrieve urls in <a href> and <img src> tags
|
||||||
regex = re.compile(r"<img[^>]+src=\"([^\">]+)\"")
|
regex_img_src = re.compile(r"<a\s+(?:[^>]*?\s+)?href=[\"'](.*?)[\"']|<img[^>]+src=\"([^\">]+)\"")
|
||||||
|
regex_strip_urls = re.compile(r"([a-z]+\.twitter\.com(\/\S+)?|https?:\/\/twitter\.com((\/\w+)?)+)")
|
||||||
http = urllib3.PoolManager()
|
http = urllib3.PoolManager()
|
||||||
|
|
||||||
clip = CliParse()
|
clip = CliParse()
|
||||||
|
@ -138,12 +139,17 @@ class Main(object):
|
||||||
# get images contained in the entry
|
# get images contained in the entry
|
||||||
images = []
|
images = []
|
||||||
if 'summary' in entry:
|
if 'summary' in entry:
|
||||||
list_img_urls = regex.findall(entry['summary'])
|
list_img_urls = regex_img_src.findall(entry['summary'])
|
||||||
if len(list_img_urls) > 0:
|
if len(list_img_urls) > 0:
|
||||||
for img_url in list_img_urls:
|
for img_url_tuple in list_img_urls:
|
||||||
resp = http.request('GET', img_url, preload_content=False)
|
for url in img_url_tuple:
|
||||||
images.append(resp)
|
if url != None and url != '':
|
||||||
resp.release_conn()
|
resp = http.request('GET', url, preload_content=False)
|
||||||
|
if resp.headers['content-type'].startswith("image") or resp.headers['content-type'].startswith("video"):
|
||||||
|
logging.debug('Found media (type={type_media}) at url {url}'.format(type_media=resp.headers['content-type'], url=url))
|
||||||
|
images.append(resp)
|
||||||
|
resp.release_conn()
|
||||||
|
|
||||||
|
|
||||||
severalwordsinhashtag = False
|
severalwordsinhashtag = False
|
||||||
# lets see if the rss feed has hashtag
|
# lets see if the rss feed has hashtag
|
||||||
|
|
Loading…
Reference in a new issue