aboutsummaryrefslogtreecommitdiff
path: root/test/requests/links_scraper/genelinks.py
diff options
context:
space:
mode:
author Alexander Kabui 2021-02-11 03:26:49 +0300
committer BonfaceKilz 2021-02-13 16:55:08 +0300
commit e1c3827d65a35d87e45cefe04135c1ff16374410 (patch)
tree fe6fdf966ec4c84938935694f0b1c6a5df817ca6 /test/requests/links_scraper/genelinks.py
parent f3a8fdc660504e0ea74ae63d5ed7c891db6e3963 (diff)
download genenetwork2-e1c3827d65a35d87e45cefe04135c1ff16374410.tar.gz
add links validator
Diffstat (limited to 'test/requests/links_scraper/genelinks.py')
-rw-r--r-- test/requests/links_scraper/genelinks.py 12
1 files changed, 11 insertions, 1 deletions
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
index 223a2c31..5dddcc47 100644
--- a/test/requests/links_scraper/genelinks.py
+++ b/test/requests/links_scraper/genelinks.py
@@ -7,11 +7,20 @@ import logging
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from urllib.parse import urljoin
+from urllib.parse import urlparse
PORT = os.environ.get("PORT", "5004")
+def is_valid_link(url_link):
+ try:
+ result = urlparse(url_link)
+ return all([result.scheme, result.netloc, result.path])
+ except Exception as e:
+ return False
+
+
def test_link(link, strict=True):
print(f"link testing {link}")
results = None
@@ -57,7 +66,8 @@ def fetch_html_links(parsed_page):
if re.match(r"^/", link_url):
full_path = urljoin('http://localhost:5004/', link_url)
- elif re.match(r'^http://', link_url):
+ elif is_valid_link(link_url):
+ print(link_url)
full_path = link_url
if full_path is not None: