Generate a list of URLs from a website using Python

import time
from urllib.parse import urljoin, urlparse

import httplib2
from bs4 import BeautifulSoup, SoupStrainer

# Wall-clock reference for the elapsed-time printout at the bottom of the script.
start_time = time.time()
# Single HTTP client shared by every page fetch inside _crawlr().
http = httplib2.Http()


def _crawlr(website, depth=1):
    """Crawl *website* and return a list of same-site URLs.

    Performs up to *depth* rounds of breadth-first link discovery:
    each round fetches every known page, extracts its ``<a href>``
    links, and keeps those that resolve to the same host and fall
    under *website*.

    Args:
        website: Root URL to crawl (a trailing ``/`` is added if missing).
        depth: Number of discovery rounds to run (default 1).

    Returns:
        List of absolute URLs found under *website*, in discovery order.
    """
    if not website.endswith('/'):
        website = website + "/"

    domain = urlparse(website).hostname
    links = []
    links_list = [website]
    visited = set()  # pages already fetched — avoids refetching on later rounds

    for _ in range(depth):
        for page in links_list:
            if page in visited:
                continue
            visited.add(page)
            try:
                # httplib2 returns (response headers, body content).
                _headers, content = http.request(page)
                for anchor in BeautifulSoup(content, "html.parser",
                                            parse_only=SoupStrainer('a')):
                    if not anchor.has_attr('href'):
                        continue
                    href = anchor['href'].strip()
                    # Resolve relative hrefs against the page they came from;
                    # handles '/path', '../path', and absolute URLs correctly.
                    absolute = urljoin(page, href)
                    # Exact hostname match (not substring) plus prefix check
                    # keeps the crawl confined to *website*.
                    if (urlparse(absolute).hostname == domain
                            and absolute.startswith(website)
                            and absolute not in links):
                        links.append(absolute)
            except Exception:
                # Best-effort crawl: a failed fetch/parse of one page must
                # not abort the whole run.
                pass

        # Next round also visits everything discovered so far.
        links_list = list(set(links_list).union(links))
    return links


# Crawl the demo site one level deep, then report total wall-clock time.
sitemap = _crawlr('https://codesnap.io', 1)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

 

Top