Starting to Crawl

Web crawlers are recursive by nature. To find URLs, a crawler must first fetch a page, examine its content, look for another URL, fetch that URL's page, and then repeat the cycle over and over.
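
A minimal sketch of that fetch, parse, and follow loop (assuming BeautifulSoup is installed; the depth argument is not part of the examples below and is added here only so the illustration terminates):

from urllib.request import urlopen
from bs4 import BeautifulSoup

def crawl(url, depth):
    # Stop recursing once the depth budget is used up
    if depth == 0:
        return
    html = urlopen(url)
    bs_obj = BeautifulSoup(html, "html.parser")
    # Find every absolute link on the page, then fetch and examine each one in turn
    for link in bs_obj.find_all("a", href=True):
        href = link.attrs["href"]
        if href.startswith("http"):
            print(href)
            crawl(href, depth - 1)

crawl("http://en.wikipedia.org/wiki/Kevin_Bacon", depth=1)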

Traversing a Single Domain

Retrieve a page and extract the links it contains:

#scrapetest.py
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs_obj = BeautifulSoup(html, "html.parser")
# Only follow article links: paths that start with /wiki/ and contain no colon
for link in bs_obj.find("div", {"id": "bodyContent"}).find_all(
        "a", href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])

Building a crawler that hops from one page to another

from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

# Seed the random number generator with the current time
random.seed(datetime.datetime.now().timestamp())

def get_links(article_url):
    html = urlopen("http://en.wikipedia.org" + article_url)
    bs_obj = BeautifulSoup(html, "html.parser")
    # Return every article link (/wiki/... with no colon) found in the page body
    return bs_obj.find("div", {"id": "bodyContent"}).find_all(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = get_links("/wiki/Kevin_Bacon")
while len(links) > 0:
    # Pick a random link on the current page and follow it
    new_article = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(new_article)
    links = get_links(new_article)

The random number generator is seeded with the current system time, which guarantees that every run of the program follows a fresh, random path through the Wikipedia articles.

random.seed(datetime.datetime.now().timestamp())
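
To see the difference, compare a fixed seed with a time-based one (a standalone illustration, separate from the crawler code):

import datetime
import random

# A fixed seed reproduces the same "random" choices on every run
random.seed(42)
print([random.randint(0, 9) for _ in range(5)])

# A time-based seed produces a different sequence on each run
random.seed(datetime.datetime.now().timestamp())
print([random.randint(0, 9) for _ in range(5)])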

Crawling an Entire Site

Deduplicating links so that no page is crawled twice:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def get_links(page_url):
    global pages
    html = urlopen("http://en.wikipedia.org" + page_url)
    bs_obj = BeautifulSoup(html, "html.parser")
    for link in bs_obj.find_all("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                new_page = link.attrs['href']
                print(new_page)
                pages.add(new_page)
                get_links(new_page)

get_links("")

The same crawler, extended to also print the title, the first paragraph, and the edit link of every page it visits:

#scrapetest.py
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def get_links(page_url):
    global pages
    html = urlopen("http://en.wikipedia.org" + page_url)
    bs_obj = BeautifulSoup(html, "html.parser")
    try:
        print(bs_obj.h1.get_text())
        print(bs_obj.find(id="mw-content-text").find_all("p")[0])
        print(bs_obj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("This page is missing some attributes! No worries, though!")

    for link in bs_obj.find_all("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                new_page = link.attrs['href']
                print("----------------\n" + new_page)
                pages.add(new_page)
                get_links(new_page)

get_links("")

Because we cannot guarantee that every page contains every kind of data, the print statements are ordered by how likely the data is to appear on a page, from most to least likely.
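
Because the try block stops at the first AttributeError, a page missing an early field also hides the later ones. A small alternative sketch checks each element for None instead, so every available field still gets printed (the helper name print_page_data is purely illustrative, and bs_obj is assumed to be parsed as in the script above):

def print_page_data(bs_obj):
    # Print each field only if it is present, so a missing title does not
    # prevent the first paragraph or the edit link from being printed.
    title = bs_obj.h1
    if title is not None:
        print(title.get_text())
    content = bs_obj.find(id="mw-content-text")
    if content is not None:
        paragraphs = content.find_all("p")
        if paragraphs:
            print(paragraphs[0])
    edit_section = bs_obj.find(id="ca-edit")
    if edit_section is not None and edit_section.find("span") is not None:
        anchor = edit_section.find("span").find("a")
        if anchor is not None:
            print(anchor.attrs['href'])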

Crawling Across the Internet

#scrapetest.py
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now().timestamp())

# Get a list of all internal links found on a page
def get_internal_links(bs_obj, include_url):
    internal_links = []
    # Find all links that begin with "/" or that contain the current URL
    for link in bs_obj.find_all("a", href=re.compile("^(/|.*" + include_url + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internal_links:
                internal_links.append(link.attrs['href'])
    return internal_links

# Get a list of all external links found on a page
def get_external_links(bs_obj, exclude_url):
    external_links = []
    # Find all links that begin with "http" or "www" and do not contain the current URL
    for link in bs_obj.find_all("a", href=re.compile("^(http|www)((?!" + exclude_url + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in external_links:
                external_links.append(link.attrs['href'])
    return external_links

def split_address(address):
    # Strip the scheme and split the rest of the address on "/"
    address_parts = address.replace("http://", "").split("/")
    return address_parts

def get_random_external_link(starting_page):
    html = urlopen(starting_page)
    bs_obj = BeautifulSoup(html, "html.parser")
    external_links = get_external_links(bs_obj, split_address(starting_page)[0])
    if len(external_links) == 0:
        # No external links on this page: follow a random internal link and try again
        internal_links = get_internal_links(bs_obj, split_address(starting_page)[0])
        return get_random_external_link(internal_links[random.randint(0, len(internal_links) - 1)])
    else:
        return external_links[random.randint(0, len(external_links) - 1)]

def follow_external_only(starting_page):
    external_link = get_random_external_link(starting_page)
    print("The random external link is: " + external_link)
    follow_external_only(external_link)

follow_external_only("http://iooy.com")

Collecting internal and external links

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

random.seed(datetime.datetime.now().timestamp())

# Get a list of all internal links found on a page
def get_internal_links(bs_obj, include_url):
    internal_links = []
    # Find all links that begin with "/" or that contain the current URL
    for link in bs_obj.find_all("a", href=re.compile("^(/|.*" + include_url + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internal_links:
                internal_links.append(link.attrs['href'])
    return internal_links

# Get a list of all external links found on a page
def get_external_links(bs_obj, exclude_url):
    external_links = []
    # Find all links that begin with "http" or "www" and do not contain the current URL
    for link in bs_obj.find_all("a", href=re.compile("^(http|www)((?!" + exclude_url + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in external_links:
                external_links.append(link.attrs['href'])
    return external_links

def split_address(address):
    # Strip the scheme and split the rest of the address on "/"
    address_parts = address.replace("http://", "").split("/")
    return address_parts

def get_random_external_link(starting_page):
    html = urlopen(starting_page)
    bs_obj = BeautifulSoup(html, "html.parser")
    external_links = get_external_links(bs_obj, split_address(starting_page)[0])
    if len(external_links) == 0:
        # No external links on this page: follow a random internal link and try again
        internal_links = get_internal_links(bs_obj, split_address(starting_page)[0])
        return get_random_external_link(internal_links[random.randint(0, len(internal_links) - 1)])
    else:
        return external_links[random.randint(0, len(external_links) - 1)]

def follow_external_only(starting_page):
    external_link = get_random_external_link(starting_page)
    print("The random external link is: " + external_link)
    follow_external_only(external_link)

# Collect every external (and internal) link discovered on the site
all_exlinks = set()
all_inlinks = set()

def get_all_external_link(site_url):
    html = urlopen(site_url)
    bs_obj = BeautifulSoup(html, "html.parser")
    internal_links = get_internal_links(bs_obj, split_address(site_url)[0])
    external_links = get_external_links(bs_obj, split_address(site_url)[0])
    for link in external_links:
        if link not in all_exlinks:
            all_exlinks.add(link)
            print(link)
    for link in internal_links:
        if link not in all_inlinks:
            print("The URL about to be fetched is: " + link)
            all_inlinks.add(link)
            # Relative links need the scheme and domain prepended before urlopen can fetch them
            if link.startswith("/"):
                link = "http://" + split_address(site_url)[0] + link
            get_all_external_link(link)

get_all_external_link("http://iooy.com")
Note: the urllib library handles redirects automatically. Just be aware that the URL of the page you end up scraping may not be the URL of the page you originally requested.
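
One way to check which URL was actually fetched after any redirects is the geturl() method on the response object returned by urlopen (a small standalone illustration, reusing the Wikipedia URL from the examples above):

from urllib.request import urlopen

# urlopen follows HTTP redirects automatically; geturl() reports the URL of the
# page that was actually retrieved, which may differ from the one requested.
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
print(html.geturl())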