diff options
author | Alexander Kabui | 2021-02-11 03:26:49 +0300 |
---|---|---|
committer | BonfaceKilz | 2021-02-13 16:55:08 +0300 |
commit | e1c3827d65a35d87e45cefe04135c1ff16374410 (patch) | |
tree | fe6fdf966ec4c84938935694f0b1c6a5df817ca6 /test/requests | |
parent | f3a8fdc660504e0ea74ae63d5ed7c891db6e3963 (diff) | |
download | genenetwork2-e1c3827d65a35d87e45cefe04135c1ff16374410.tar.gz |
add links validator
Diffstat (limited to 'test/requests')
-rw-r--r-- | test/requests/links_scraper/genelinks.py | 12 |
1 files changed, 11 insertions, 1 deletions
```diff
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 223a2c31..5dddcc47 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -7,11 +7,20 @@ import logging
 from urllib.request import urlopen as uReq
 from bs4 import BeautifulSoup as soup
 from urllib.parse import urljoin
+from urllib.parse import urlparse
 
 
 PORT = os.environ.get("PORT", "5004")
 
 
+def is_valid_link(url_link):
+    try:
+        result = urlparse(url_link)
+        return all([result.scheme, result.netloc, result.path])
+    except Exception as e:
+        return False
+
+
 def test_link(link, strict=True):
     print(f"link testing {link}")
     results = None
@@ -57,7 +66,8 @@ def fetch_html_links(parsed_page):
 
         if re.match(r"^/", link_url):
             full_path = urljoin('http://localhost:5004/', link_url)
-        elif re.match(r'^http://', link_url):
+        elif is_valid_link(link_url):
+            print(link_url)
             full_path = link_url
 
         if full_path is not None:
```