Crawling All Internal and External Links of a Web Page with Python

Posted by 專注的阿熊 on 2021-04-09

class Queue(object):

    # Initialize the queue
    def __init__(self):
        self.items = []

    # Enqueue: append an item at the tail
    def enqueue(self, item):
        self.items.append(item)

    # Dequeue: remove and return the item at the head
    def dequeue(self):
        if self.is_Empty():
            print("The queue is currently empty!!")
        else:
            return self.items.pop(0)

    # Check whether the queue is empty
    def is_Empty(self):
        return self.items == []

    # Queue length
    def size(self):
        return len(self.items)

    # Return the head element, or None if the queue is empty
    def front(self):
        if self.is_Empty():
            print("The queue is currently empty!!")
        else:
            # dequeue() pops from index 0, so the head is items[0],
            # not the last element
            return self.items[0]
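A side note on the queue itself: list.pop(0) shifts every remaining element and is O(n), while the standard library's collections.deque pops from the left in O(1). A minimal sketch of an equivalent queue built on deque (the class name DequeQueue is introduced here just for illustration):

from collections import deque

class DequeQueue(object):
    def __init__(self):
        self.items = deque()

    def enqueue(self, item):
        self.items.append(item)

    def dequeue(self):
        # Return None instead of raising when empty, matching Queue above
        return self.items.popleft() if self.items else None

    def is_Empty(self):
        return len(self.items) == 0

    def size(self):
        return len(self.items)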

# Import libraries
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import re
import urllib.parse
import time
import random

queueInt = Queue()   # Queue of internal links to visit
queueExt = Queue()   # Queue of external links found

externalLinks = []
internalLinks = []

# Request headers used for every fetch, defined at module level so that
# both getAllLinks() and deepLinks() can reference them
headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'}

# Collect all external links on the page
def getExterLinks(bs, exterurl):
    # Find every link that starts with http or www and does not contain
    # the current site's domain
    for link in bs.find_all('a', href=re.compile(
            '^(http|www)((?!' + urlparse(exterurl).netloc + ').)*$')):
        # By the standard, a URL may only contain a subset of ASCII
        # characters; anything else (e.g. Chinese characters) is
        # non-conforming and must be percent-encoded first.
        link.attrs['href'] = urllib.parse.quote(link.attrs['href'], safe='?=&:/')
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                queueExt.enqueue(link.attrs['href'])
                externalLinks.append(link.attrs['href'])
                print(link.attrs['href'])

#     return externalLinks
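The filter above works through a negative lookahead: "(?!netloc)." consumes a character only if the current site's domain does not start at that position, so any URL containing the domain fails to match. A quick standalone check of that behavior (the domain and sample URLs are made up for illustration; note too that the unescaped dots in the domain are regex metacharacters, so re.escape(netloc) would be strictly tighter):

import re

netloc = 'example.com'   # hypothetical current site
pattern = re.compile('^(http|www)((?!' + netloc + ').)*$')
print(bool(pattern.match('https://other.org/page')))      # True: external
print(bool(pattern.match('https://example.com/about')))   # False: same site
print(bool(pattern.match('/relative/path')))              # False: not http/www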

# Collect all internal links on the page
def getInterLinks(bs, interurl):
    interurl = '{}://{}'.format(urlparse(interurl).scheme,
                                urlparse(interurl).netloc)
    # Find every link that starts with "/" or contains the current
    # site's domain
    for link in bs.find_all('a', href=re.compile(
            '^(/|.*' + urlparse(interurl).netloc + ')')):
        link.attrs['href'] = urllib.parse.quote(link.attrs['href'], safe='?=&:/')
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                # startswith() checks whether the string begins with a
                # given prefix
                if link.attrs['href'].startswith('//'):
                    # Protocol-relative link: prepend only the scheme;
                    # prepending scheme://netloc as well would produce a
                    # malformed URL
                    full = urlparse(interurl).scheme + ':' + link.attrs['href']
                    if full not in internalLinks:
                        queueInt.enqueue(full)
                        internalLinks.append(full)
                elif link.attrs['href'].startswith('/'):
                    if interurl + link.attrs['href'] not in internalLinks:
                        queueInt.enqueue(interurl + link.attrs['href'])
                        internalLinks.append(interurl + link.attrs['href'])
                else:
                    queueInt.enqueue(link.attrs['href'])
                    internalLinks.append(link.attrs['href'])

#     return internalLinks
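The branching above resolves relative hrefs by hand. The standard library's urllib.parse.urljoin handles absolute, root-relative, protocol-relative, and plain relative hrefs uniformly against a base URL, which would collapse the three cases into one; a small standalone demonstration (the base URL is a made-up example):

from urllib.parse import urljoin

base = 'https://example.com/section/page.html'
print(urljoin(base, '/about'))                    # https://example.com/about
print(urljoin(base, '//cdn.example.com/app.js'))  # https://cdn.example.com/app.js
print(urljoin(base, 'next.html'))                 # https://example.com/section/next.html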

def deepLinks():
    # Keep dequeuing internal links and crawling each one until the
    # queue is exhausted (dequeue() returns None once it is empty)
    while not queueInt.is_Empty():
        i = queueInt.dequeue()
        if i is None:
            break
        else:
            print('Visiting internal link')
            print(i)
            print('New external links found')
    #         html = urlopen(i)
            html = requests.get(i, headers=headers_)
            time.sleep(random.random() * 3)
            domain1 = '{}://{}'.format(urlparse(i).scheme, urlparse(i).netloc)
            bs = BeautifulSoup(html.content, 'html.parser')
            getExterLinks(bs, domain1)
            getInterLinks(bs, domain1)
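One robustness gap worth noting: requests.get is called with no error handling, so a single dead or slow link aborts the whole crawl. A hedged sketch of a guarded fetch that could wrap those calls (the helper name fetch_page and the 10-second timeout are assumptions, not part of the original script):

import requests

def fetch_page(url):
    # Return the response, or None if the request fails or times out,
    # so that one bad link does not stop the crawl
    try:
        return requests.get(url, headers=headers_, timeout=10)
    except requests.exceptions.RequestException as e:
        print('Failed to fetch {}: {}'.format(url, e))
        return None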

def getAllLinks(url):
#     html = urlopen(url)
    html = requests.get(url, headers=headers_)
    time.sleep(random.random() * 3)  # random pause to mimic human browsing
    domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
    bs = BeautifulSoup(html.content, 'html.parser')
    getInterLinks(bs, domain)
    getExterLinks(bs, domain)
    deepLinks()

getAllLinks('https://image.baidu.com/')
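As written, the crawl keeps going until the internal-link queue drains, which on a large site is effectively unbounded. A minimal sketch of a page cap on top of the script's own queue and helpers (MAX_PAGES and deepLinksCapped are hypothetical names introduced here for illustration):

MAX_PAGES = 50   # assumed cap, tune to taste

def deepLinksCapped():
    visited = 0
    while not queueInt.is_Empty() and visited < MAX_PAGES:
        i = queueInt.dequeue()
        if i is None:
            break
        html = requests.get(i, headers=headers_)
        time.sleep(random.random() * 3)
        domain1 = '{}://{}'.format(urlparse(i).scheme, urlparse(i).netloc)
        bs = BeautifulSoup(html.content, 'html.parser')
        getExterLinks(bs, domain1)
        getInterLinks(bs, domain1)
        visited += 1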


From the "ITPUB Blog". Link: http://blog.itpub.net/69946337/viewspace-2767312/. If reposting, please credit the source; otherwise legal liability will be pursued.
