about | summary | refs | log | tree | commit | diff
path: root/test/requests/links_scraper/genelinks.py
diff options
context:
space:
mode:
author: zsloan 2021-02-16 21:06:39 +0000
committer: zsloan 2021-02-16 21:06:39 +0000
commit: ecd189bd171efab8405dd3d8875ee402578bea6f (patch)
tree: 5090ae3e69246b0c6a7bbcf3649daf16bab68fba /test/requests/links_scraper/genelinks.py
parent: 31047da5d5a50ef2f6485584c9b38e6d8c9068a9 (diff)
parent: 187415f223b101f8c0b0ac100b2cf8e19c0ad3a5 (diff)
download: genenetwork2-ecd189bd171efab8405dd3d8875ee402578bea6f.tar.gz
Merge branch 'testing' of github.com:genenetwork/genenetwork2 into testing
Diffstat (limited to 'test/requests/links_scraper/genelinks.py')
-rw-r--r-- test/requests/links_scraper/genelinks.py 133
1 files changed, 133 insertions, 0 deletions
diff --git a/test/requests/links_scraper/genelinks.py b/test/requests/links_scraper/genelinks.py
new file mode 100644
index 00000000..12300f4a
--- /dev/null
+++ b/test/requests/links_scraper/genelinks.py
@@ -0,0 +1,133 @@
+import re
+import requests
+import urllib3
+import os
+import logging
+
+from urllib.request import urlopen as uReq
+from bs4 import BeautifulSoup as soup
+from urllib.parse import urljoin
+from urllib.parse import urlparse
+
+
+# Port of the local server under test; taken from the PORT environment
+# variable and defaulting to "5004". Kept as a string because it is only
+# interpolated into URLs.
+PORT = os.environ.get("PORT", "5004")
+# Directory (relative to the current working directory) that
+# search_templates() walks looking for ``.html`` files.
+TEMPLATE_PATH = "../wqflask/wqflask/templates"
+
+# URLs that test_link() flagged as broken; filled by the fetch_* helpers and
+# reported by the script entry point.
+BROKEN_LINKS = set()
+
+
+def search_templates():
+    """Parse every ``.html`` file found under ``TEMPLATE_PATH``.
+
+    The template directory is walked recursively. Each HTML file is read as
+    UTF-8 and parsed with the ``html.parser`` backend.
+
+    Returns:
+        A list with one parsed BeautifulSoup document per HTML template.
+        No link checking happens here; the caller is expected to pass the
+        parsed pages to the ``fetch_*`` helpers.
+    """
+    html_parsed_pages = []
+    for subdir, _dirs, files in os.walk(TEMPLATE_PATH):
+        for file_name in files:
+            if not file_name.endswith(".html"):
+                continue
+            file_path = os.path.join(subdir, file_name)
+            # The parser consumes the whole file while constructing the
+            # document, so the handle can be closed right afterwards instead
+            # of being leaked once per template.
+            with open(file_path, encoding="utf8") as template:
+                html_parsed_pages.append(soup(template, "html.parser"))
+
+    return html_parsed_pages
+
+
+def is_valid_link(url_link):
+    """Return True when *url_link* is an absolute URL.
+
+    A link counts as absolute when it has both a scheme and a network
+    location (e.g. ``https://example.org`` or ``https://example.org/x``).
+    The path is deliberately NOT required: a bare ``http://example.org`` is
+    a perfectly valid link and must be checked like any other.
+
+    Relative links (``/css/x.css``), scheme-only links (``mailto:...``),
+    empty strings and ``None`` all yield False.
+    """
+    try:
+        result = urlparse(url_link)
+    except (TypeError, ValueError, AttributeError):
+        # urlparse rejects non-string input and malformed authorities
+        # (e.g. an unbalanced IPv6 bracket); neither is a usable link.
+        return False
+    return bool(result.scheme and result.netloc)
+
+
+def test_link(link):
+    """Request *link* and report whether it should be treated as broken.
+
+    The URL is fetched with certificate verification disabled and a 10
+    second timeout. Any exception raised while fetching is treated as a
+    408 (request timeout) response.
+
+    Returns:
+        True when the resulting status code is greater than 403 (i.e. the
+        link is considered broken), False otherwise.
+    """
+    print(f'Checking -->{link}')
+    try:
+        status_code = requests.get(link, verify=False, timeout=10).status_code
+    except Exception:
+        # Best effort: an unreachable or malformed target counts as a timeout.
+        status_code = 408
+
+    return int(status_code) > 403
+
+
+def fetch_css_links(parsed_page):
+    """Check the target of every ``<link>`` tag in *parsed_page*.
+
+    Absolute URLs are checked as-is. Site-relative URLs beginning with
+    ``/css`` or ``/js`` are resolved against the local server
+    (``http://localhost:<PORT>/``). Anything else is ignored. Every URL
+    that ``test_link`` flags is added to ``BROKEN_LINKS``.
+
+    Args:
+        parsed_page: parsed BeautifulSoup document to inspect.
+    """
+    print("fetching css links")
+    # Honour the PORT setting instead of assuming the default port.
+    base_url = f"http://localhost:{PORT}/"
+    for link in parsed_page.findAll("link"):
+        link_url = link.attrs.get("href")
+        if not link_url:
+            # A <link> without href has nothing to check; letting None
+            # through would crash the prefix test below.
+            continue
+
+        if is_valid_link(link_url):
+            full_path = link_url
+        elif link_url.startswith(("/css", "/js")):
+            full_path = urljoin(base_url, link_url)
+        else:
+            continue
+
+        if test_link(full_path):
+            BROKEN_LINKS.add(full_path)
+
+
+def fetch_html_links(parsed_page):
+    """Check the target of every ``<a>`` tag in *parsed_page*.
+
+    Links starting with ``/`` are resolved against the local server
+    (``http://localhost:<PORT>/``); absolute URLs are checked as-is.
+    Anything else (fragments, ``mailto:``, ``javascript:`` ...) is ignored.
+    Every URL that ``test_link`` flags is added to ``BROKEN_LINKS``.
+
+    Args:
+        parsed_page: parsed BeautifulSoup document to inspect.
+    """
+    print("fetching a tags ")
+
+    # Honour the PORT setting instead of assuming the default port.
+    base_url = f"http://localhost:{PORT}/"
+    for link in parsed_page.findAll("a"):
+        link_url = link.attrs.get("href")
+        if not link_url:
+            # Named anchors (<a name="...">) carry no href; passing None on
+            # to the prefix test would raise instead of skipping the tag.
+            continue
+
+        if link_url.startswith("/"):
+            full_path = urljoin(base_url, link_url)
+        elif is_valid_link(link_url):
+            full_path = link_url
+        else:
+            continue
+
+        if test_link(full_path):
+            BROKEN_LINKS.add(full_path)
+
+
+def fetch_script_tags(parsed_page):
+    """Check the ``src`` of every ``<script>`` tag in *parsed_page*.
+
+    Scripts must be served locally: an absolute (external) ``src`` aborts
+    the run. Local sources beginning with ``/css`` or ``/js`` are resolved
+    against ``http://localhost:<PORT>/`` and added to ``BROKEN_LINKS`` when
+    ``test_link`` flags them. Inline scripts (no ``src``) are skipped.
+
+    Args:
+        parsed_page: parsed BeautifulSoup document to inspect.
+
+    Raises:
+        SystemExit: if a script is loaded from an absolute URL.
+    """
+    print("--->fetching js links")
+    # Honour the PORT setting instead of assuming the default port.
+    base_url = f"http://localhost:{PORT}/"
+    for script in parsed_page.findAll("script"):
+        js_link = script.attrs.get("src")
+        if js_link is None:
+            continue
+
+        if is_valid_link(js_link):
+            # Adjacent literals instead of a backslash continuation, so the
+            # source indentation does not leak into the message.
+            raise SystemExit(
+                "Failed,the library should be packaged in guix. "
+                "Please contact,http://genenetwork.org/ for more details")
+
+        if js_link.startswith(("/css", "/js")):
+            full_path = urljoin(base_url, js_link)
+            if test_link(full_path):
+                BROKEN_LINKS.add(full_path)
+
+
+def fetch_page_links(page_url):
+    """Download *page_url* and check its script, stylesheet and anchor links.
+
+    Insecure-request warnings are silenced first because ``test_link``
+    fetches with certificate verification disabled.
+
+    Args:
+        page_url: URL of the page to download and inspect.
+    """
+    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+    # The response is fully consumed while parsing, so close the connection
+    # as soon as the document is built instead of leaking it.
+    with uReq(page_url) as html_page:
+        parsed_page = soup(html_page, "html.parser")
+
+    fetch_script_tags(parsed_page=parsed_page)
+    fetch_css_links(parsed_page=parsed_page)
+    fetch_html_links(parsed_page=parsed_page)
+
+
+def webpages_to_check():
+    """Return the list of page URLs whose links are to be verified.
+
+    Currently this is only the root page of the local server on ``PORT``.
+    """
+    home_page = f"http://localhost:{PORT}/"
+    return [home_page]
+
+
+if __name__ == '__main__':
+    # NOTE: scanning the on-disk templates via search_templates() is
+    # currently disabled; only the live pages are checked.
+    for page in webpages_to_check():
+        fetch_page_links(page)
+
+    # BROKEN_LINKS accumulates across pages, so report it once after every
+    # page has been visited rather than re-printing it per page.
+    if BROKEN_LINKS:
+        print("THE LINKS BELOW ARE BROKEN>>>>>>>>>>>>>")
+        # Sorted so the report is stable between runs.
+        for link in sorted(BROKEN_LINKS):
+            print(link)
+
+        # A non-zero exit makes the failure visible to the caller / CI job.
+        raise SystemExit(
+            "The links Above are broken.Please contact genenetwork.org<<<<<<<<")