How do I extract the main text of a web page in Python? Thanks.

    import urllib.request

    url = "http://…"
    response = urllib.request.urlopen(url)
    page = response.read()

Extracting the text from a web page in Python (Python 2, using sgmllib):

    import os, sys, datetime
    import httplib, urllib, re
    from sgmllib import SGMLParser
    import types

    class Html2txt(SGMLParser):
        def reset(self):
            self.text = ''
            self.inbody = True
            SGMLParser.reset(self)

        def handle_data(self, text):
            # collect text only while we are inside the body
            if self.inbody:
                self.text += text

        def start_head(self, text):
            self.inbody = False

        def end_head(self):
            self.inbody = True

    if __name__ == "__main__":
        parser = Html2txt()
        parser.feed(urllib.urlopen("http://…").read())
        parser.close()
        print parser.text.strip()
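sgmllib no longer exists in Python 3, so the recipe above is Python 2 only. A rough equivalent on Python 3 can be built on html.parser from the standard library; this is only a minimal sketch, and example.com stands in for whatever page you want to strip:

    from html.parser import HTMLParser
    from urllib.request import urlopen

    class Html2Text(HTMLParser):
        def __init__(self):
            super().__init__()
            self.chunks = []
            self.keep = True    # False while inside <head>, <script> or <style>

        def handle_starttag(self, tag, attrs):
            if tag in ("head", "script", "style"):
                self.keep = False

        def handle_endtag(self, tag):
            if tag in ("head", "script", "style"):
                self.keep = True

        def handle_data(self, data):
            if self.keep:
                self.chunks.append(data)

    if __name__ == "__main__":
        html = urlopen("http://example.com/").read().decode("utf-8", "ignore")
        parser = Html2Text()
        parser.feed(html)
        print("".join(parser.chunks).strip())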
Downloading a web page in Python:

    import httplib

    conn = httplib.HTTPConnection("…")    # host elided in the original
    conn.request("GET", "/index.html")
    r1 = conn.getresponse()
    print r1.status, r1.reason
    data = r1.read()
    print data
    conn.close()

Downloading a web page with Python is super simple:

    from urllib import urlopen

    webdata = urlopen("http://…").read()
    print webdata

Dive Into Python covers this as well.
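Both snippets above are Python 2. On Python 3, httplib became http.client and urlopen lives in urllib.request; a quick sketch of the same two approaches, with example.com again standing in for the elided host:

    import http.client
    import urllib.request

    # low-level variant with http.client
    conn = http.client.HTTPConnection("example.com")
    conn.request("GET", "/index.html")
    r1 = conn.getresponse()
    print(r1.status, r1.reason)
    data = r1.read()
    conn.close()

    # one-liner with urllib.request
    webdata = urllib.request.urlopen("http://example.com/").read()
    print(webdata[:200])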
Downloading web page content in Python with the pycurl module

Downloading web page content with Python works quite well. I had been experimenting with the urllib module, but I heard about the pycurl module, which is said to be better than urllib, so I tried it out. Enough talk, here is the code:

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import StringIO
    import pycurl

    def writefile(fstr, xfilename):
        f = open(xfilename, 'w')
        f.write(fstr)
        f.close()

    html = StringIO.StringIO()
    c = pycurl.Curl()
    myurl = 'http://…'    # URL elided in the original

    c.setopt(pycurl.URL, myurl)

    # write callback
    c.setopt(pycurl.WRITEFUNCTION, html.write)

    c.setopt(pycurl.FOLLOWLOCATION, 1)

    # maximum number of redirects, guards against redirect traps
    c.setopt(pycurl.MAXREDIRS, 5)

    # connection timeouts
    c.setopt(pycurl.CONNECTTIMEOUT, 60)
    c.setopt(pycurl.TIMEOUT, 300)

    # pretend to be a browser
    c.setopt(pycurl.USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)")

    # perform the request; this blocks until the transfer finishes
    c.perform()

    # print the HTTP status code (e.g. 200); optional
    print c.getinfo(pycurl.HTTP_CODE)

    # print the page content
    print html.getvalue()
    # save it to down.txt
    writefile(html.getvalue(), "down.txt")

The pycurl module itself can be downloaded and installed from http://… .
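pycurl works on Python 3 as well; the page body is then collected into an io.BytesIO buffer and decoded explicitly. A minimal sketch with the same options, the URL again being a placeholder:

    import io
    import pycurl

    buf = io.BytesIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, "http://example.com/")
    c.setopt(pycurl.WRITEFUNCTION, buf.write)   # response bytes land in the buffer
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 60)
    c.setopt(pycurl.TIMEOUT, 300)
    c.perform()
    print(c.getinfo(pycurl.HTTP_CODE))
    c.close()

    with open("down.txt", "w", encoding="utf-8") as f:
        f.write(buf.getvalue().decode("utf-8", "ignore"))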
Several ways to download a web page

1. The simplest:

    import urllib2

    fd = urllib2.urlopen(url_link)    # url_link is the address to fetch
    data = fd.read()

This is the most concise way, and it is of course a GET.

2. Via GET:

    import socket
    import urllib2

    def GetHtmlSource(url):
        try:
            htmSource = ''
            req = urllib2.Request(url)
            fd = urllib2.urlopen(req)
            while 1:
                data = fd.read(1024)
                if not len(data):
                    break
                htmSource += data
            fd.close()
            del fd
            del req
            htmSource = htmSource.decode('cp936')    # the target site is GBK encoded
            htmSource = formatStr(htmSource)         # formatStr is a helper defined elsewhere in the original
            return htmSource
        except socket.error, err:
            str_err = "%s" % err
            return ""
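For comparison, a Python 3 counterpart of GetHtmlSource using urllib.request and the same 1024-byte read loop; the cp936 decoding is kept from the original, and the formatStr post-processing is left out since it is not shown in the post:

    import socket
    import urllib.request

    def get_html_source(url):
        try:
            chunks = []
            with urllib.request.urlopen(url, timeout=30) as fd:
                while True:
                    data = fd.read(1024)
                    if not data:
                        break
                    chunks.append(data)
            # the original decoded as cp936 (GBK); change this to the site's real encoding
            return b"".join(chunks).decode("cp936", "ignore")
        except (OSError, socket.error) as err:
            print("download failed: %s" % err)
            return ""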
3. Via GET, using httplib:

    import httplib
    import traceback
    from urlparse import urlsplit

    def GetHtmlSource_Get(htmurl):
        htmSource = ""
        try:
            urlx = urlsplit(htmurl)
            conn = httplib.HTTPConnection(urlx.netloc)   # connect to the host part of the URL
            conn.connect()
            conn.putrequest("GET", htmurl, None)
            conn.putheader("Content-Length", 0)
            conn.putheader("Connection", "close")
            conn.endheaders()
            res = conn.getresponse()
            htmSource = res.read()
        except Exception, err:
            traceback.print_exc()
        conn.close()
        return htmSource

4. Via POST:

    import httplib
    import socket
    import traceback
    from urlparse import urlsplit

    def GetHtmlSource_Post(getString):
        htmSource = ""
        try:
            url = urlsplit("http://…:8080")    # host elided in the original
            conn = httplib.HTTPConnection(url.netloc)
            conn.connect()
            conn.putrequest("POST", "/sipo/zljs/hyjs-jieguo.jsp")
            conn.putheader("Content-Length", len(getString))
            conn.putheader("Content-Type", "application/x-www-form-urlencoded")
            conn.putheader("Connection", "Keep-Alive")
            conn.endheaders()
            conn.send(getString)
            f = conn.getresponse()
            if not f:
                raise socket.error, "timed out"
            htmSource = f.read()
            f.close()
            conn.close()
            return htmSource
        except Exception, err:
            traceback.print_exc()
            conn.close()
            return htmSource
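On Python 3 the same POST can be written with http.client and urllib.parse; the host below is a placeholder for the value elided above, and request() fills in Content-Length for us:

    import http.client
    import urllib.parse

    def post_form(body, base="http://example.com:8080",
                  path="/sipo/zljs/hyjs-jieguo.jsp"):
        parts = urllib.parse.urlsplit(base)
        conn = http.client.HTTPConnection(parts.netloc)
        try:
            conn.request("POST", path, body=body.encode("utf-8"),
                         headers={"Content-Type": "application/x-www-form-urlencoded"})
            resp = conn.getresponse()
            return resp.read()
        finally:
            conn.close()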
This article comes from a CSDN blog; please credit the source when reposting: http://…

A vertical-search crawler built with Python + Django + BeautifulSoup

The crawling itself, fetching specific data, is done with Python and BeautifulSoup, and Django provides a management platform that coordinates the crawl. I am very fond of the Django admin backend, so this time I use it to manage the crawled links; that way the crawler can cope with later requirements such as crawling in separate time slots or periodically re-crawling addresses that have already been fetched. The database is Python's built-in sqlite3, which is very convenient. I happen to be building a movie recommendation system these days and need some movie data, so the example in this article crawls specific data from Douban movies.
Step 1: define the Django model

The crawler follows the idea behind Nutch's crawler, simplified here. Each crawl run starts by loading the links that have not yet been saved (is_save = False) from the database into the crawl list. You can also filter the links according to your own needs. Python code:

    from django.db import models

    class Crawl_URL(models.Model):
        url = models.URLField('crawl address', max_length=100, unique=True)
        weight = models.SmallIntegerField('crawl depth', default=0)    # depth starts at 1
        is_save = models.BooleanField('already saved', default=False)
        date = models.DateTimeField('saved at', auto_now_add=True, blank=True, null=True)

        def __unicode__(self):
            return self.url

Then generate the corresponding table. We also need an admin backend:

    from django.contrib import admin

    class Crawl_URLAdmin(admin.ModelAdmin):
        list_display = ('url', 'weight', 'is_save', 'date',)
        ordering = ('-id',)
        list_filter = ('is_save', 'weight', 'date',)
        fields = ('url', 'weight', 'is_save',)

    admin.site.register(Crawl_URL, Crawl_URLAdmin)
Step 2: write the crawler

The crawler is single-threaded, and it pauses after every fetch, because Douban blocks crawlers that fetch above a certain rate. The crawl is driven by depth: on every pass it first generates the links, then fetches them and parses out more links, and finally marks the fetched links with is_save=True and stores the new links in the database. After each depth level finishes, importing the links into the database takes quite a while, because every link has to be checked against what is already stored. Only addresses that match the regular expression http://… are parsed for data, and links that do not belong to the movie section are ignored outright. For the very first crawl you have to add a seed link in the admin backend, for example http://… ; a sketch of doing the same from the Django shell follows.
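If you would rather not click through the admin, the seed link can also be inserted from the Django shell (python manage.py shell). A minimal sketch; the Douban front page below is only a stand-in for the movie URL that was elided above:

    # run inside: python manage.py shell
    from movie.models import Crawl_URL

    # stand-in seed URL; replace it with the Douban movie page you want to start from
    seed = Crawl_URL(url='http://movie.douban.com/', weight=1, is_save=False)
    seed.save()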
Python code (the author notes that this block would not post with its formatting intact):

    # coding=UTF-8
    import re
    import urllib2
    from BeautifulSoup import *
    from urlparse import urljoin
    from pysqlite2 import dbapi2 as sqlite
    from movie.models import *
    from django.contrib.auth.models import User
    from time import sleep

    image_path = 'C:/Users/soul/djcodetest/picture/'
    user = User.objects.get(id=1)

    def crawl(depth=10):
        for i in range(1, depth):
            print 'start crawling for depth %d.' % i
            pages = Crawl_URL.objects.filter(is_save=False)
            newurls = {}
            for crawl_page in pages:
                page = crawl_page.url
                try:
                    c = urllib2.urlopen(page)
                except:
                    continue
                try:
                    # parse the metadata and the urls
                    soup = BeautifulSoup(c.read())
                    # parse a movie page
                    if re.search(r'http://…', page):    # movie-page pattern, elided in the original
                        read_html(soup)
                    # pull out the valid links and put them into newurls
                    links = soup('a')
                    for link in links:
                        if 'href' in dict(link.attrs):
                            url = urljoin(page, link['href'])
                            if url.find("'") != -1:      # skip links containing quote characters
                                continue
                            if len(url) > 60:
                                continue
                            url = url.split('#')[0]      # remove the location portion
                            if re.search(r'http://…', url):    # same elided pattern: keep only in-site links
                                # the link is valid, store it in the dict
                                newurls[url] = crawl_page.weight + 1
                                try:
                                    print 'add url :' + url
                                except:
                                    pass
                except Exception, args:
                    try:
                        print "Could not parse : %s" % args
                    except:
                        pass
                # mark this page as fetched
                crawl_page.is_save = True
                crawl_page.save()
                # sleep for 2.5 seconds between requests
                sleep(2.5)
            # store newurls in the database with is_save=False, weight=i
            save_url(newurls)

    # save the urls into the database
    def save_url(newurls):
        for (url, weight) in newurls.items():
            url = Crawl_URL(url=url, weight=weight)
            try:
                url.save()
            except:
                try:
                    print 'duplicate url: ' + url.url
                except:
                    pass
        return True
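The try/except around url.save() leans on the unique constraint on url to drop duplicates. get_or_create expresses the same idea more directly; a small sketch with the same model:

    def save_url(newurls):
        # create each discovered link once; URLs that already exist are simply reported
        for url, weight in newurls.items():
            obj, created = Crawl_URL.objects.get_or_create(url=url,
                                                           defaults={'weight': weight})
            if not created:
                print('duplicate url: %s' % url)
        return True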
Step 3: parse the pages with BeautifulSoup

Extract the movie title, the image, the plot introduction, the leading actors, the tags and the region. For how to use BeautifulSoup, see the BeautifulSoup documentation.
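read_html below writes into Movie, Actor and Tag models imported from movie.models; those models are not shown in the post, so the following definitions are only a guess at the minimal fields the code needs:

    from django.db import models
    from django.contrib.auth.models import User

    class Actor(models.Model):
        name = models.CharField(max_length=100)

    class Tag(models.Model):
        name = models.CharField(max_length=30)

    class Movie(models.Model):
        title = models.CharField(max_length=200)
        intro = models.TextField()
        area = models.CharField(max_length=50, blank=True)
        version = models.CharField(max_length=50)
        image = models.CharField(max_length=200)
        upload_user = models.ForeignKey(User)
        actors = models.ManyToManyField(Actor)
        tags = models.ManyToManyField(Tag)

With something like that in place, the parsing function: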
    # crawl the data
    def read_html(soup):
        # parse out the title
        html_title = soup.html.head.title.string
        title = html_title[:len(html_title) - 5]
        # parse out the movie introduction
        try:
            intro = soup.find('span', attrs={'class': 'all hidden'}).text
        except:
            try:
                node = soup.find('div', attrs={'class': 'blank20'}).previousSibling
                intro = node.contents[0] + node.contents[2]
            except:
                try:
                    contents = soup.find('div', attrs={'class': 'blank20'}).previousSibling.previousSibling.text
                    intro = contents[:len(contents) - 22]
                except:
                    intro = u'暂无'

        # fetch the image
        html_image = soup('a', href=re.compile('…'))[0]['href']    # image-URL pattern elided in the original
        data = urllib2.urlopen(html_image).read()
        image = '201003/' + html_image[html_image.rfind('/') + 1:]
        f = file(image_path + image, 'wb')
        f.write(data)
        f.close()

        # parse out the region
        try:
            soup_obmo = soup.find('div', attrs={'class': 'obmo'}).findAll('span')
            html_area = soup_obmo[0].nextSibling.split('/')
            area = html_area[0].lstrip()
        except:
            area = ''

        # time = soup_obmo[1].nextSibling.split(' ')[1]
        # time = time.strptime(html_time, '%Y-%m-%d')

        # create the movie object
        new_movie = Movie(title=title, intro=intro, area=area, version='暂无',
                          upload_user=user, image=image)
        new_movie.save()
        try:
            actors = soup.find('div', attrs={'id': 'info'}).findAll('span')[5].nextSibling.nextSibling.string.split(' ')[0]
            actors_list = Actor.objects.filter(name=actors)
            if len(actors_list) == 1:
                actor = actors_list[0]
                new_movie.actors.add(actor)
            else:
                actor = Actor(name=actors)
                actor.save()
                new_movie.actors.add(actor)
        except:
            pass

        # tags
        tags = soup.find('div', attrs={'class': 'blank20'}).findAll('a')
        for tag_html in tags:
            tag_str = tag_html.string
            if len(tag_str) > 4:
                continue
            tag_list = Tag.objects.filter(name=tag_str)
            if