Python爬取国外天气预报网站的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
  # Location(False, "中国", "北京", "zh")
  # Location(True, "", "亚洲", "zh")
  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    self.country_name = country_name
    self.loc_name = loc_name
    self.lang = lang
    self.is_beyond_country = is_beyond_country
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  global count
  if url.find("weather-forecast") != -1:
    count = count + 1
    if count % 500 == 0:
      prn_lock.acquire()
      print "count:%d" % (count)
      prn_lock.release()
    return [url]
  page = urllib2.urlopen(url).read()
  time.sleep(0.01)
  #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
  locs = re.findall(pattern, page)
  locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
  if not recursive:
    urls = [url for url, name in locs]
    return urls
  urls = []
  for _url, _name in locs:
    lst = GetLocationURLs(_url, True)
    urls.extend(lst)
  return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
  q.put(url)
def working():
  while True:
    url = q.get()
    lst = GetLocationURLs(url, True)
    print "%s %d urls " % (url, len(lst))
    lock.acquire()
    location_urls.extend(lst)
    lock.release()
    q.task_done()
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()  
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
  lines = page.splitlines()
  count = 0
  start = -1
  opened = False
  for line in lines:
    if line.find("<ul id=\"country-breadcrumbs\">") != -1:
      start = count
      opened = True
    if opened and line.find("</ul>") != -1:
      end = count
      opened = False
    count = count + 1
  return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
  rc = []
  for node in nodelist:
    if node.nodeType == node.TEXT_NODE:
      rc.append(HTMLParser.HTMLParser().unescape(node.data))
  return ''.join(rc)
def FindCondition(page):
  pat = "<span class=\"cond\">(.*?)</span>"
  cds = re.findall(pat, page)
  cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
  return cds  
def ExtractInfo(url):
  try:
    page = urllib2.urlopen(url).read()
  except Exception, e:
    return []
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  dom = minidom.parseString(text.encode("utf-8"))
  locs = []
  lis = dom.getElementsByTagName("li")
  for li in lis:
    adr_list = li.getElementsByTagName("a")
    if adr_list:
      locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
    strs = li.getElementsByTagName("strong")
    if strs:
      locs.append(GetText(strs[0].childNodes).encode("utf-8"))
  cds = FindCondition(page)
  return locs, cds
def AddMap(lst, m):
  for x in lst:
    if m.get(x) == None:
      m[x] = 1
def working():
  while True:
    urls = q.get()
    #print len(urls)
    m = {}
    m2 = {}
    count = 0
    for url in urls:
      count = count + 1
      #print "%d/%d" % (count, len(urls))
      locs, cds = ExtractInfo(url)
      AddMap(locs, m)
      AddMap(cds, m2)
    locks[1].acquire()
    AddMap(m.keys(), locations)
    AddMap(m2.keys(), conds)
    locks[1].release()
    q.task_done()
def main():
  if len(sys.argv) < 2:
    exit()
  loc_path = sys.argv[1]
  fp = open(loc_path, "r")
  urls = [line.strip() for line in fp]
  fp.close()
  #urls = urls[0:1000]
  blocks = len(urls) / ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    end = start + blocks
    if end > len(urls):
      end = len(urls)
    q.put(urls[start:end])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  fp = open("location_name.fr", "w")
  fp.write("\n".join(locations.keys()))
  fp.close()
  fp = open("conditions.fr", "w")
  fp.write("\n".join(conds.keys()))
  fp.close()
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python 相关文章推荐
python动态加载变量示例分享
Feb 17 Python
Python实现对PPT文件进行截图操作的方法
Apr 28 Python
python fabric使用笔记
May 09 Python
代码分析Python地图坐标转换
Feb 08 Python
python中使用PIL制作并验证图片验证码
Mar 15 Python
关于pymysql模块的使用以及代码详解
Sep 01 Python
python3读取csv文件任意行列代码实例
Jan 13 Python
Python中 Global和Nonlocal的用法详解
Jan 20 Python
解决TensorFlow模型恢复报错的问题
Feb 06 Python
python实现井字棋小游戏
Mar 04 Python
python求numpy中array按列非零元素的平均值案例
Jun 08 Python
pyqt5 textEdit、lineEdit操作的示例代码
Aug 12 Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
python简单文本处理的方法
Jul 10 #Python
Python实现把json格式转换成文本或sql文件
Jul 10 #Python
Python中的一些陷阱与技巧小结
Jul 10 #Python
Python中的fileinput模块的简单实用示例
Jul 09 #Python
Python中的anydbm模版和shelve模版使用指南
Jul 09 #Python
python冒泡排序简单实现方法
Jul 09 #Python
You might like
PHP实现的简单三角形、矩形周长面积计算器分享
2014/11/18 PHP
PHP实现股票趋势图和柱形图
2015/02/07 PHP
PHP开发中AJAX技术的简单应用
2015/12/11 PHP
Laravel如何创建服务器提供者实例代码
2019/04/15 PHP
jQuery Ajax文件上传(php)
2009/06/16 Javascript
利用jQuery接受和处理xml数据的代码(.net)
2011/03/28 Javascript
使用js实现按钮控制文本框加1减1应用于小时+分钟
2013/12/09 Javascript
JS限制文本框只能输入数字和字母方法
2015/02/28 Javascript
Javascript设计模式之观察者模式的多个实现版本实例
2015/03/03 Javascript
AngularJS压缩JS技巧分析
2016/11/08 Javascript
vue.js树形组件之删除双击增加分支实例代码
2017/02/28 Javascript
BootStrap表单时间选择器详解
2017/05/09 Javascript
jQuery模拟实现天猫购物车动画效果实例代码
2017/05/25 jQuery
详解Node.js中的Async和Await函数
2018/02/22 Javascript
关于在vue 中使用百度ueEditor编辑器的方法实例代码
2018/09/14 Javascript
微信小程序实现多选功能
2018/11/04 Javascript
JS滚轮控制图片缩放大小和拖动的实例代码
2018/11/20 Javascript
js实现浏览器打印功能的示例代码
2020/07/15 Javascript
Vue使用screenfull实现全屏效果
2020/09/17 Javascript
使用Python编写Linux系统守护进程实例
2015/02/03 Python
Flask框架工厂函数用法实例分析
2019/05/25 Python
使用Python串口实时显示数据并绘图的例子
2019/12/26 Python
利用Python代码实现一键抠背景功能
2019/12/29 Python
Python依赖包迁移到断网环境操作
2020/07/13 Python
Python如何执行精确的浮点数运算
2020/07/31 Python
python简单利用字典破解zip文件口令
2020/09/07 Python
python SOCKET编程基础入门
2021/02/27 Python
中国跨镜手机配件批发在线商店:TVC-Mall
2019/08/20 全球购物
FC-Moto美国:欧洲最大的摩托车服装和头盔商店之一
2019/08/24 全球购物
给酒店员工的表扬信
2014/01/11 职场文书
关于期中考试的反思
2014/02/02 职场文书
初中英语教师个人工作总结2015
2015/07/21 职场文书
关于元旦的广播稿2016
2015/12/17 职场文书
外出学习心得体会范文
2016/01/18 职场文书
干部作风纪律整顿心得体会
2016/01/23 职场文书
python 多态 协议 鸭子类型详解
2021/11/27 Python