A Python crawler that scrapes book information from Amazon, Dangdang, and Douban.

Author: 我就是个世界 | Posted: 2009-05-14
Today on Douban I came across a book crawler someone wrote in Python. It scrapes book information and reviews from Amazon, Dangdang, and Douban, then feeds them through templates to generate an entire book site.

Quite impressive! Worth studying!
  
Demo: www.hilaosan.com/

The site holds information on 97 books in total.
The homepage displays 15 of them, picked at random from the 97 (a minimal sketch of such a pick follows).
Apart from the book's basic information, a detail page consists entirely of user reviews, each labeled as posted by an Amazon, Dangdang, or Douban user.
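The site-generation code isn't part of the post, but the random homepage pick is simple. Here is a minimal sketch, assuming the ISBNs end up in the isbn.txt file that the second script below writes:

[code]
# -*- coding: utf-8 -*-
# Hypothetical sketch (not from the original post): pick 15 random
# books for the homepage from the ISBNs saved by the second script.
import random

f = open('isbn.txt', 'r')
all_isbns = f.read().split('\n')
f.close()

for isbn in random.sample(all_isbns, 15):
    print isbn   # the template would render <isbn>_info.txt for each
[/code]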
Below is the crawler code: the first script crawls the book pages on Amazon.cn's (Joyo) bestseller lists, and the second scrapes each of those pages for book information and reviews.

[code]
# -*- coding: utf-8 -*-
import urllib,re,time
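
# This script walks Amazon.cn's (Joyo) bestseller ranking pages
# breadth-first: listRegex finds further ranking pages to visit,
# detailRegex finds the book detail pages we actually want.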

class spider:
    def __init__(self, sUrl, detailRegex, listRegex):
        self.listUrls = [sUrl]    # crawl frontier: list pages still to visit
        self.detailUrls = []      # book detail pages found so far
        self.detailRegex = detailRegex
        self.listRegex = listRegex
        

    def controlSpider(self):
        cnt = 0
        for i in self.listUrls:
            print 'begin to crawl ' + i
            self.getDetailUrl(i)
            self.getListUrl(i)
            cnt = cnt + 1
            print cnt
            time.sleep(2)   # be polite: pause between requests
        self.saveDetailUrl() # save the detail-page urls to a file
            
    def getDetailUrl(self, url):
        # collect links to book detail pages found on this page
        content = self.getContent(url)
        r = re.compile(self.detailRegex, re.I|re.U|re.S)
        m = r.findall(content)
        for n in m:
            if n not in self.detailUrls:
                self.detailUrls.append(n)
                print 'found ' + n + ', added it to the detail list.'

    
    def getListUrl(self, url):
        # collect links to further ranking (list) pages
        content = self.getContent(url)
        r = re.compile(self.listRegex, re.I|re.U)
        m = r.findall(content)
        for n in m:
            if n not in self.listUrls:
                self.listUrls.append(n)
                print 'found ' + n + ', added it to the list-page queue.'

    def getContent(self,url):
        try:
            openurl = urllib.urlopen(url)
            content = openurl.read()
            openurl.close()
            return content
        except:
            print 'error in getting url ' + url
            return ''

    def saveDetailUrl(self):
        file = open('detailurl.txt','w')
        file.write('\n'.join(self.detailUrls))
        file.close()


if __name__=='__main__':
    startUrl='http://www.amazon.cn/mn/rank?nodeid=51080&page=1&uid=168-6258410-2196214'
    detailRegex = '<div class="pic"><a href="(http://www.amazon.cn/dp/[0-9a-z]+)"><img'
    #listRegex = '<a href="(http://www.amazon.cn/mn/[store]*?rank.+?)"'
    listRegex = '<a href="(http://www.amazon.cn/mn/rank.nodeid=51080.+?)"'
    s = spider(startUrl, detailRegex, listRegex)
    s.controlSpider() # print the detail-page urls and save them to a txt file
    print 'Finished crawling......'

[/code]
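
One detail worth calling out: controlSpider iterates over self.listUrls while getListUrl appends newly discovered ranking pages to that same list. Iterating over a Python list does pick up elements appended mid-loop, and that is what turns the single start URL into a breadth-first crawl of the whole ranking section. A minimal demonstration of the pattern (not from the original post):

[code]
# Appending to a list while iterating over it extends the loop,
# which is how the spider grows its crawl frontier from one start url.
frontier = ['page1']
for page in frontier:
    if page == 'page1':
        frontier.append('page2')   # "discovered" while crawling page1
    print 'crawling ' + page
# output: crawling page1, then crawling page2
[/code]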

[code]
# -*- coding: utf-8 -*-
import re, urllib
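
# For each Amazon.cn detail url read from detailurl.txt, this script
# extracts the book's ISBN, finds the same book on Dangdang and Douban,
# and saves the book info plus reviews from all three sites into
# per-ISBN text files (<isbn>_info.txt, <isbn>_Amazon.txt, etc.).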
isbns = []

class GetDangdangDoubanAmazonUrl():
    def __init__(self, sUrl):
        self.startUrl = sUrl   # an Amazon.cn book detail-page url
        self.isbns = []        # unused; results go into the module-level isbns list

    def controlSpider(self):
        isbn = self.getISBN(self.startUrl)
        
        if isbn:
            # build the Dangdang review-list url from the book's Dangdang ID
            myID = self.getDangdangUrl(isbn)
            if myID:
                ddUrl = 'http://comm.dangdang.com/reviewlist/' + myID +'/showall/?sort=useful&setcount=20'
            else:
                print 'can not parse Dangdang\'s ID for ISBN ' + isbn
                ddUrl = ''


            # build the Douban review-list url from the book's Douban ID
            myID = self.getDoubanUrl(isbn)
            if myID:
                doubanUrl = 'http://www.douban.com/subject/' + myID +'/reviews'
            else:
                print 'can not parse Douban\'s ID for ISBN ' + isbn
                doubanUrl = ''

            
            # build the Amazon (Joyo) review url from the detail-page url
            amazonUrl = 'http://www.amazon.cn/mn/productReviewApplication?uid=168-6258410-2196214&prodid='\
                        + self.startUrl.replace('http://www.amazon.cn/dp/','')
            self.getBookInfo(isbn, self.startUrl)
            self.getAmazonCom(isbn, amazonUrl)
            self.getDangdangCom(isbn, ddUrl)
            self.getDoubanCom(isbn, doubanUrl)
            isbns.append(isbn)

    def getBookInfo(self, isbn, url):
        # scrape name, cover image, author, price and other details
        # from the Amazon.cn product page
        content = self.getContent(url)
        rNameImage = re.compile('id="ImageShow" alt="(.+?)" src="(.+?)" border=')
        rAuthor = re.compile('<title>.+?/(.+?)-')
        rPrice = re.compile('<span class="PriceCharacter">¥</span>(.+?)</strike>')
        rOther = re.compile('<span class="dark">(.+?)<br />')
        bookName = ''
        bookImage = ''
        author = ''
        price = ''
        other = ''
        m = rNameImage.findall(content)
        for n in m:
            bookName = n[0]
            bookImage = n[1]
        m = rAuthor.findall(content)
        for n in m:
            author = n
        m = rPrice.findall(content)
        for n in m:
            price = n
        m = rOther.findall(content)
        
        for n in m:
            if n.find('ISBN') == -1:
                other = other + n.replace('</span>', '') + '\n'
        fileName = isbn + '_info.txt'
        file = open(fileName,'w')
        # '作者' = author, '原价' = list price; kept in Chinese for the generated site
        file.write(bookName + '\n' + bookImage + '\n作者:' + author + '\n原价:' + price + '\n' + other)
        file.close()

        
    def errorSave(self, error):
        file = open('error.txt','a')
        file.write(error)
        file.close()

    def getAmazonCom(self,isbn,Url):
        content = self.getContent(Url)
        # matches '共有N位顾客参与打分' ("N customers have rated this item")
        rTotal = re.compile('共有(\d+?)位顾客参与打分')
        m = rTotal.findall(content)
        if m:
            totalPage = (int(m[0]) / 10) + 1
            if totalPage > 10:
                totalPage = 10
        else:
            totalPage = 1
        fileName = isbn + '_Amazon.txt'
        #r = re.compile('<div id="\d+".*?>(.+?)<a href=', re.I|re.U|re.S)
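        # group 1: review date from <span class="tiny">; group 2: the review body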
        r = re.compile('<span class="tiny">([-: \d]+?)</span>.+?<div id="\d+".*?>(.+?)<a href=', re.I|re.U|re.S)
        moreCommPage = '<a target="_blank" href="http://click.linktech.cn/?m=joyo&a=A100013618&l=99999&l_cd1=0&l_cd2=1&tu=' \
                           + urllib.quote(Url) + '">亚马逊详细评论</a>'
        for i in range(1, totalPage+1):
            content = self.getContent(Url + '&page=' + str(i))
            m = r.findall(content)
            file = open(fileName,'a')
            
            for n in m:
                file.write('<span class="userN">亚马逊网友</span><br><br>' + n[1].strip() + \
                           moreCommPage + '<br><br>' + n[0] +'\n**********\n')
                moreCommPage = ''
                
            file.close()
            print Url + '&page=' + str(i) + ' saved successfully.'


    def getDangdangCom(self,isbn,Url):
        content = self.getContent(Url)
        rTotal = re.compile('<em>(\d+?)</em>')   # total review count on the page
        m = rTotal.findall(content)
        if m:
            totalPage = (int(m[0]) / 20) + 1
            if totalPage > 5:
                totalPage = 5
        else:
            totalPage = 1
        fileName = isbn + '_Dangdang.txt'
        #r = re.compile("<div class='center_border'><div class='appear_time'>.+?</div><p>(.+?)</p></div>", re.I|re.U|re.S)
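        # group 1: post time from the appear_time div; group 2: the review text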
        r = re.compile("<div class='center_border'><div class='appear_time'>(.+?)</div><p>(.+?)</p></div>", re.I|re.U|re.S)
        # the Chinese label is re-encoded to GBK to match Dangdang's page encoding
        moreCommPage = '<a target="_blank" href="http://click.linktech.cn/?m=dangdang&a=A100013618&l=99999&l_cd1=0&l_cd2=1&tu=' + \
                           urllib.quote(Url) + '">' + '当当网详细评论'.decode('utf-8').encode('gbk') + '</a>'
        for i in range(1, totalPage+1):
            content = self.getContent(Url + '&page=' + str(i))
            m = r.findall(content)
            file = open(fileName,'a')
            
            for n in m:
                file.write('<span class="userN">' + '当当网友'.decode('utf-8').encode('gbk') + '</span><br><br>' + \
                           n[1].strip() + moreCommPage + '<br><br>' + n[0] +'\n**********\n')
                moreCommPage = ''
            file.close()
            print Url + '&page=' + str(i) + ' saved successfully.'
                
    def getDoubanCom(self,isbn,Url):
        content = self.getContent(Url)
        rTotal = re.compile('共(\d+?)条')   # matches '共N条' ("N reviews in total")
        m = rTotal.findall(content)
        if m:
            totalPage = (int(m[0]) / 25) + 1
            if totalPage > 4:
                totalPage = 4
        else:
            totalPage = 1
        fileName = isbn + '_Douban.txt'
        #r = re.compile("<div id='review_\d+?_short'>(.+?)<[aspan]+? class=", re.I|re.U|re.S)
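        # group 1: the review excerpt; group 2: text of the class="pl" element
        # (apparently the review date)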
        r = re.compile("<div id='review_\d+?_short'>(.+?)<[as].+? class=\"pl\">(.+?) ", re.I|re.U|re.S)
        moreCommPage = '<a target="_blank" href="' + Url + '">豆瓣网详细评论</a>'
        for i in range(1, totalPage+1):
            content = self.getContent(Url + '?start=' + str((i-1) * 25))
            m = r.findall(content)
            file = open(fileName,'a')
            
            for n in m:
                file.write('<span class="userN">豆瓣网友</span><br><br>' + n[0].strip() + \
                           moreCommPage + '<br><br>' + n[1] + '\n**********\n')
                moreCommPage = ''
                
            file.close()
            print Url + '?start=' + str((i-1) * 25) + ' saved successfully.'

    

    def getDangdangUrl(self, isbn):
        content = self.getContent('http://search.dangdang.com/search.aspx?selectcatalog=&key=' + isbn + '&search=%CB%D1+%CB%F7&catalog=&SearchFromTop=1')
        r = re.compile("<h2><a href='http://search.dangdang.com/rd.asp.id=(\d+?)&")
        m = r.findall(content)
        for n in m:
            return n
        self.errorSave('can not parse Dangdang\'s Url at http://search.dangdang.com/search.aspx?selectcatalog=&key=' + isbn + '&search=%CB%D1+%CB%F7&catalog=&SearchFromTop=1 \n')
        return ''

    def getDoubanUrl(self, isbn):
        content = self.getContent('http://www.douban.com/subject_search?search_text=' + isbn )
        r = re.compile('<a href="/subject/(\d+?)/')
        m = r.findall(content)
        for n in m:
            return n
        self.errorSave('can not parse Douban\'s Url at http://www.douban.com/subject_search?search_text=' + isbn + '\n')
        return ''
                
    def getISBN(self,url):
        content = self.getContent(url)
        # '条形码' = barcode; Amazon.cn lists the ISBN as the barcode
        r = re.compile('条形码:</span>(\d+?)<br />')
        m = r.findall(content)
        for n in m:
            return n
        self.errorSave('can not get ISBN at ' + url + '\n')
        return ''
    
    

        
    def getContent(self,url):
        try:
            openurl = urllib.urlopen(url)
            content = openurl.read()
            openurl.close()
            return content
        except:
            print 'error in getting url ' + url
            self.errorSave('error in getting url ' + url + '\n')
            return ''
    


if __name__=='__main__':
    file = open('detailurl.txt','r')
    urlList = file.read().split('\n')
    file.close()
    for i in urlList:
        #print i
        g = GetDangdangDoubanAmazonUrl(i)
        g.controlSpider()
    file = open('isbn.txt','w')
    file.write('\n'.join(isbns))
    file.close()

[/code]
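All three review scrapers size their page loops the same way: read the total review count off the first page, divide by the per-page count, add one, and cap the result (10 pages for Amazon, 5 for Dangdang, 4 for Douban). Note this is Python 2 integer division, and when the count is an exact multiple of the page size the loop fetches one empty trailing page, which is harmless here. A standalone sketch of that arithmetic (not from the original post):

[code]
# Paging arithmetic shared by getAmazonCom, getDangdangCom and
# getDoubanCom (Python 2 integer division).
def total_pages(review_count, per_page, cap):
    pages = (review_count / per_page) + 1
    return cap if pages > cap else pages

print total_pages(95, 10, 10)   # Amazon: 95/10 + 1 = 10 pages
print total_pages(37, 20, 5)    # Dangdang: 2 pages
print total_pages(130, 25, 4)   # Douban: capped at 4 pages
[/code]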
