1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
import re
import requests
import urllib3
import os
import logging
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from urllib.parse import urljoin
# TCP port the locally running app listens on; overridable via the $PORT
# environment variable (kept as a string because it is only interpolated
# into URLs, never used numerically).
PORT = os.environ.get("PORT", "5004")
def test_link(link, strict=True):
    """Fetch *link* and report its HTTP status code.

    Args:
        link: Absolute URL to check.
        strict: When True (default), an unreachable or malformed link
            aborts the whole run via SystemExit; when False it is only
            logged as an error and reported with a 404 status.

    Raises:
        SystemExit: If the request fails and *strict* is True.
    """
    print(f"link testing {link}")
    results = None
    try:
        # verify=False: the target is a local dev server, typically with a
        # self-signed certificate.  NOTE(review): acceptable for localhost
        # smoke tests only — never against untrusted hosts.
        results = requests.get(link, verify=False, timeout=10)
    except requests.RequestException:
        # Narrowed from bare Exception: only network/HTTP failures should
        # trigger the "bad link" path.
        if strict:
            raise SystemExit(
                "The link does not exist or is wrongly formatted")
        logging.error(f"FAILED:{link} does not exist or is wrongly formatted")
    # Fall back to an int 404 (was the string "404") so the reported value
    # has a consistent type with requests' status_code.
    status_code = results.status_code if results is not None else 404
    print(f'the link {link} ---> {status_code}')
def fetch_css_links(parsed_page):
    """Validate every local stylesheet/JS <link> reference in *parsed_page*.

    Local references (paths under /css or /js) are resolved against the
    local server and checked with test_link(); external http:// links are
    deliberately skipped.  Tags without an href are ignored (previously
    they crashed with TypeError in re.match).

    Args:
        parsed_page: BeautifulSoup document (anything exposing findAll
            returning tags with an ``attrs`` dict works).
    """
    print("fetching css links")
    # Honor $PORT instead of the previously hard-coded 5004, matching the
    # module-level PORT constant (default behavior is unchanged).
    base_url = f"http://localhost:{os.environ.get('PORT', '5004')}/"
    for tag in parsed_page.findAll("link"):
        href = tag.attrs.get("href")
        if href is None:
            continue  # e.g. <link> without an href attribute
        if href.startswith("http://"):
            # External stylesheet — not checked here.
            # NOTE(review): original left an open question on whether
            # external css links should be an error.
            continue
        if href.startswith(("/css", "/js")):
            test_link(urljoin(base_url, href))
def fetch_html_links(parsed_page):
    """Validate every anchor (<a>) link in *parsed_page*.

    Site-relative hrefs (starting with "/") are resolved against the local
    server; absolute http:// links are checked as-is.  Anchors without an
    href are ignored (previously they crashed with TypeError in re.match).
    Other schemes (https, mailto, fragments) are skipped, preserving the
    original behavior.

    Args:
        parsed_page: BeautifulSoup document (anything exposing findAll
            returning tags with an ``attrs`` dict works).
    """
    print("fetching a tags ")
    # Honor $PORT instead of the previously hard-coded 5004 (default
    # behavior is unchanged).
    base_url = f"http://localhost:{os.environ.get('PORT', '5004')}/"
    for anchor in parsed_page.findAll("a"):
        href = anchor.attrs.get("href")
        if href is None:
            continue  # e.g. <a name="..."> with no href
        full_path = None
        if href.startswith("/"):
            full_path = urljoin(base_url, href)
        elif href.startswith("http://"):
            full_path = href
        if full_path is not None:
            test_link(full_path)
def fetch_script_tags(parsed_page):
    """Validate every <script src=...> reference in *parsed_page*.

    External http:// script sources are a hard failure: third-party JS
    must be packaged locally (in guix) rather than fetched from a CDN.
    Local paths under /css or /js are checked with test_link().  Inline
    scripts (no src) are ignored.

    Args:
        parsed_page: BeautifulSoup document (anything exposing findAll
            returning tags with an ``attrs`` dict works).

    Raises:
        SystemExit: If any script is loaded from an external http:// URL.
    """
    print("--->fetching js links")
    # Honor $PORT instead of the previously hard-coded 5004 (default
    # behavior is unchanged).
    base_url = f"http://localhost:{os.environ.get('PORT', '5004')}/"
    for script in parsed_page.findAll("script"):
        src = script.attrs.get("src")
        if src is None:
            continue  # inline <script> block
        if src.startswith("http://"):
            # The original message used a backslash line continuation that
            # baked stray whitespace and run-together punctuation into the
            # user-facing text; implicit concatenation fixes that.
            raise SystemExit(
                "Failed, the library should be packaged in guix. "
                "Please contact http://genenetwork.org/ for more details")
        if src.startswith(("/css", "/js")):
            test_link(urljoin(base_url, src))
def fetch_page_links(page_url):
    """Download *page_url* and validate all script, css and anchor links.

    Args:
        page_url: Absolute URL of the page to crawl.
    """
    # Local dev servers commonly use self-signed certificates; silence the
    # insecure-request warning noise before any fetching starts.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    raw_html = uReq(page_url)
    document = soup(raw_html, "html.parser")
    fetch_script_tags(document)
    fetch_css_links(document)
    fetch_html_links(document)
if __name__ == "__main__":
    # Guarded entry point: previously this ran at import time, so merely
    # importing the module crawled the network.  Crawl the local landing
    # page and verify every link it references.
    fetch_page_links(f"http://localhost:{PORT}/")
|