Python爬取国外天气预报网站的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
  # Location(False, "中国", "北京", "zh")
  # Location(True, "", "亚洲", "zh")
  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    self.country_name = country_name
    self.loc_name = loc_name
    self.lang = lang
    self.is_beyond_country = is_beyond_country
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  global count
  if url.find("weather-forecast") != -1:
    count = count + 1
    if count % 500 == 0:
      prn_lock.acquire()
      print "count:%d" % (count)
      prn_lock.release()
    return [url]
  page = urllib2.urlopen(url).read()
  time.sleep(0.01)
  #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
  locs = re.findall(pattern, page)
  locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
  if not recursive:
    urls = [url for url, name in locs]
    return urls
  urls = []
  for _url, _name in locs:
    lst = GetLocationURLs(_url, True)
    urls.extend(lst)
  return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
  q.put(url)
def working():
  while True:
    url = q.get()
    lst = GetLocationURLs(url, True)
    print "%s %d urls " % (url, len(lst))
    lock.acquire()
    location_urls.extend(lst)
    lock.release()
    q.task_done()
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()  
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
  lines = page.splitlines()
  count = 0
  start = -1
  opened = False
  for line in lines:
    if line.find("<ul id=\"country-breadcrumbs\">") != -1:
      start = count
      opened = True
    if opened and line.find("</ul>") != -1:
      end = count
      opened = False
    count = count + 1
  return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
  rc = []
  for node in nodelist:
    if node.nodeType == node.TEXT_NODE:
      rc.append(HTMLParser.HTMLParser().unescape(node.data))
  return ''.join(rc)
def FindCondition(page):
  pat = "<span class=\"cond\">(.*?)</span>"
  cds = re.findall(pat, page)
  cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
  return cds  
def ExtractInfo(url):
  try:
    page = urllib2.urlopen(url).read()
  except Exception, e:
    return []
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  dom = minidom.parseString(text.encode("utf-8"))
  locs = []
  lis = dom.getElementsByTagName("li")
  for li in lis:
    adr_list = li.getElementsByTagName("a")
    if adr_list:
      locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
    strs = li.getElementsByTagName("strong")
    if strs:
      locs.append(GetText(strs[0].childNodes).encode("utf-8"))
  cds = FindCondition(page)
  return locs, cds
def AddMap(lst, m):
  for x in lst:
    if m.get(x) == None:
      m[x] = 1
def working():
  while True:
    urls = q.get()
    #print len(urls)
    m = {}
    m2 = {}
    count = 0
    for url in urls:
      count = count + 1
      #print "%d/%d" % (count, len(urls))
      locs, cds = ExtractInfo(url)
      AddMap(locs, m)
      AddMap(cds, m2)
    locks[1].acquire()
    AddMap(m.keys(), locations)
    AddMap(m2.keys(), conds)
    locks[1].release()
    q.task_done()
def main():
  if len(sys.argv) < 2:
    exit()
  loc_path = sys.argv[1]
  fp = open(loc_path, "r")
  urls = [line.strip() for line in fp]
  fp.close()
  #urls = urls[0:1000]
  blocks = len(urls) / ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    end = start + blocks
    if end > len(urls):
      end = len(urls)
    q.put(urls[start:end])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  fp = open("location_name.fr", "w")
  fp.write("\n".join(locations.keys()))
  fp.close()
  fp = open("conditions.fr", "w")
  fp.write("\n".join(conds.keys()))
  fp.close()
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python 相关文章推荐
python以环状形式组合排列图片并输出的方法
Mar 17 Python
Python的Asyncore异步Socket模块及实现端口转发的例子
Jun 14 Python
python模拟登录并且保持cookie的方法详解
Apr 04 Python
python的exec、eval使用分析
Dec 11 Python
浅谈Scrapy框架普通反爬虫机制的应对策略
Dec 28 Python
Python用imghdr模块识别图片格式实例解析
Jan 11 Python
Python函数中参数是传递值还是引用详解
Jul 02 Python
pytorch dataloader 取batch_size时候出现bug的解决方式
Feb 20 Python
python统计文章中单词出现次数实例
Feb 27 Python
Python json读写方式和字典相互转化
Apr 18 Python
python statsmodel的使用
Dec 21 Python
Python正则表达式中flags参数的实例详解
Apr 01 Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
python简单文本处理的方法
Jul 10 #Python
Python实现把json格式转换成文本或sql文件
Jul 10 #Python
Python中的一些陷阱与技巧小结
Jul 10 #Python
Python中的fileinput模块的简单实用示例
Jul 09 #Python
Python中的anydbm模版和shelve模版使用指南
Jul 09 #Python
python冒泡排序简单实现方法
Jul 09 #Python
You might like
用PHP实现验证码功能
2006/10/09 PHP
提升PHP执行速度全攻略(上)
2006/10/09 PHP
PHP Curl多线程原理实例详解
2013/11/06 PHP
php多文件上传下载示例分享
2014/02/20 PHP
Yii2中简单的场景使用介绍
2017/06/02 PHP
表单(FORM)的一些实用效果代码
2007/03/25 Javascript
Jquery对select的增、删、改、查操作
2015/02/06 Javascript
js鼠标滑过图片震动特效的方法
2015/02/17 Javascript
AngularJS优雅的自定义指令
2016/07/01 Javascript
如何清除IE10+ input X 文本框的叉叉和密码输入框的眼睛图标
2016/12/21 Javascript
微信小程序开发之入门实例教程篇
2017/03/07 Javascript
JavaScript实现简单精致的图片左右无缝滚动效果
2017/03/16 Javascript
原生JS京东轮播图代码
2017/03/22 Javascript
jQuery ajax读取本地json文件的实例
2017/10/31 jQuery
Node.js学习教程之HTTP/2服务器推送【译】
2017/10/31 Javascript
浅谈JavaScript 代码整洁之道
2018/10/23 Javascript
为什么说JavaScript预解释是一种毫无节操的机制详析
2018/11/18 Javascript
详解vue项目中调用百度地图API使用方法
2019/04/25 Javascript
node.js中path路径模块的使用方法实例分析
2020/02/13 Javascript
Openlayers显示地理位置坐标的方法
2020/09/28 Javascript
Python中生成Epoch的方法
2017/04/26 Python
python爬取微信公众号文章的方法
2019/02/26 Python
Python去除字符串前后空格的几种方法
2019/03/04 Python
Python统计一个字符串中每个字符出现了多少次的方法【字符串转换为列表再统计】
2019/05/05 Python
python函数与方法的区别总结
2019/06/23 Python
基于Python的接口自动化读写excel文件的方法
2021/01/15 Python
AmazeUI 列表的实现示例
2020/08/17 HTML / CSS
护理学中专毕业生求职信
2013/11/11 职场文书
《最大的“书”》教学反思
2014/02/14 职场文书
科级干部考察材料
2014/02/15 职场文书
工会2014法制宣传日活动总结
2014/11/01 职场文书
学校节水倡议书
2015/04/29 职场文书
2019求职信大礼包
2019/05/15 职场文书
Python利器openpyxl之操作excel表格
2021/04/17 Python
python如何利用cv2模块读取显示保存图片
2021/06/04 Python
详解Redis在SpringBoot工程中的综合应用
2021/10/16 Redis