Python web crawler example using RabbitMQ
Posted in Python on February 20, 2014
Writing tasks.py
from celery import Celery
from tornado.httpclient import HTTPClient, HTTPError

app = Celery('tasks')
app.config_from_object('celeryconfig')

@app.task
def get_html(url):
    # Fetch the page synchronously; the body is returned through the Celery result backend.
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        return response.body
    except HTTPError:
        return None
    finally:
        # Runs whether the fetch succeeded or failed, unlike the unreachable close() after return.
        http_client.close()
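Before wiring up the spider, the task can be exercised on its own. The snippet below is a minimal check of my own, assuming RabbitMQ is running locally and a worker has been started with celery -A tasks worker --loglevel=info; the URL is just a placeholder.

from tasks import get_html

result = get_html.delay("https://3water.com/")   # enqueue the fetch on RabbitMQ
html = result.get(timeout=10)                    # block until a worker returns the body
print(html[:200] if html else "fetch failed")    # the task returns None on HTTP errors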
Writing celeryconfig.py
CELERY_IMPORTS = ('tasks',)
BROKER_URL = 'amqp://guest@localhost:5672//'
CELERY_RESULT_BACKEND = 'amqp://'
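CELERY_IMPORTS tells the worker which module holds the tasks, BROKER_URL points at the default guest account of a local RabbitMQ instance, and CELERY_RESULT_BACKEND = 'amqp://' sends task results back over RabbitMQ as well. As a rough equivalent (an illustration, not part of the original post), the same settings could be passed straight to the Celery constructor instead of living in celeryconfig.py:

from celery import Celery

# Same broker and backend as celeryconfig.py, configured inline.
app = Celery('tasks',
             broker='amqp://guest@localhost:5672//',
             backend='amqp://')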
Writing spider.py
from tasks import get_html
from queue import Queue
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import threading

class spider(object):
    def __init__(self):
        self.visited = {}
        self.queue = Queue()

    def process_html(self, html):
        pass
        # print(html)

    def _add_links_to_queue(self, url_base, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            try:
                url = link['href']
            except KeyError:
                continue
            url_com = urlparse(url)
            if not url_com.netloc:
                # Relative link: resolve it against the page it came from.
                self.queue.put(urljoin(url_base, url))
            else:
                self.queue.put(url_com.geturl())

    def start(self, url):
        self.queue.put(url)
        for i in range(20):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()
        self.queue.join()

    def _worker(self):
        while 1:
            url = self.queue.get()
            if url in self.visited:
                self.queue.task_done()
                continue
            # Hand the download off to a Celery worker over RabbitMQ.
            result = get_html.delay(url)
            try:
                html = result.get(timeout=5)
            except Exception as e:
                print(url)
                print(e)
                self.queue.task_done()
                continue
            if html:
                self.process_html(html)
                self._add_links_to_queue(url, html)
            self.visited[url] = True
            self.queue.task_done()

s = spider()
s.start("https://3water.com/")
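process_html is left as a stub above, so the crawl only exercises the queueing and fetching. As an illustration of where page handling would go (the subclass name and behaviour below are my own, not from the original post), one could print each page's title; this sketch assumes it is appended in spider.py, before the final two lines, so it reuses the imports there.

class TitlePrinter(spider):
    def process_html(self, html):
        # Parse the fetched page and print its <title>, if any.
        soup = BeautifulSoup(html, 'html.parser')
        if soup.title and soup.title.string:
            print(soup.title.string.strip())

# TitlePrinter reuses spider's queue, threads and Celery task unchanged:
# s = TitlePrinter()
# s.start("https://3water.com/")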
Because of some special cases in real-world HTML, the program still needs further refinement.
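One such case (my guess at the kind of refinement meant, not something the original spells out) is links whose scheme is not HTTP at all, such as mailto: or javascript: URLs, which currently end up in the queue and fail inside get_html. A possible tweak to _add_links_to_queue in the spider class:

    def _add_links_to_queue(self, url_base, html):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a', href=True):
            url = link['href']
            url_com = urlparse(url)
            # Ignore mailto:, javascript:, tel: and similar non-HTTP links.
            if url_com.scheme and url_com.scheme not in ('http', 'https'):
                continue
            if not url_com.netloc:
                self.queue.put(urljoin(url_base, url))
            else:
                self.queue.put(url_com.geturl())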