Python爬取国外天气预报网站的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
  # Location(False, "中国", "北京", "zh")
  # Location(True, "", "亚洲", "zh")
  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    self.country_name = country_name
    self.loc_name = loc_name
    self.lang = lang
    self.is_beyond_country = is_beyond_country
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  global count
  if url.find("weather-forecast") != -1:
    count = count + 1
    if count % 500 == 0:
      prn_lock.acquire()
      print "count:%d" % (count)
      prn_lock.release()
    return [url]
  page = urllib2.urlopen(url).read()
  time.sleep(0.01)
  #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
  locs = re.findall(pattern, page)
  locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
  if not recursive:
    urls = [url for url, name in locs]
    return urls
  urls = []
  for _url, _name in locs:
    lst = GetLocationURLs(_url, True)
    urls.extend(lst)
  return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
  q.put(url)
def working():
  while True:
    url = q.get()
    lst = GetLocationURLs(url, True)
    print "%s %d urls " % (url, len(lst))
    lock.acquire()
    location_urls.extend(lst)
    lock.release()
    q.task_done()
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()  
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
  lines = page.splitlines()
  count = 0
  start = -1
  opened = False
  for line in lines:
    if line.find("<ul id=\"country-breadcrumbs\">") != -1:
      start = count
      opened = True
    if opened and line.find("</ul>") != -1:
      end = count
      opened = False
    count = count + 1
  return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
  rc = []
  for node in nodelist:
    if node.nodeType == node.TEXT_NODE:
      rc.append(HTMLParser.HTMLParser().unescape(node.data))
  return ''.join(rc)
def FindCondition(page):
  pat = "<span class=\"cond\">(.*?)</span>"
  cds = re.findall(pat, page)
  cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
  return cds  
def ExtractInfo(url):
  try:
    page = urllib2.urlopen(url).read()
  except Exception, e:
    return []
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  dom = minidom.parseString(text.encode("utf-8"))
  locs = []
  lis = dom.getElementsByTagName("li")
  for li in lis:
    adr_list = li.getElementsByTagName("a")
    if adr_list:
      locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
    strs = li.getElementsByTagName("strong")
    if strs:
      locs.append(GetText(strs[0].childNodes).encode("utf-8"))
  cds = FindCondition(page)
  return locs, cds
def AddMap(lst, m):
  for x in lst:
    if m.get(x) == None:
      m[x] = 1
def working():
  while True:
    urls = q.get()
    #print len(urls)
    m = {}
    m2 = {}
    count = 0
    for url in urls:
      count = count + 1
      #print "%d/%d" % (count, len(urls))
      locs, cds = ExtractInfo(url)
      AddMap(locs, m)
      AddMap(cds, m2)
    locks[1].acquire()
    AddMap(m.keys(), locations)
    AddMap(m2.keys(), conds)
    locks[1].release()
    q.task_done()
def main():
  if len(sys.argv) < 2:
    exit()
  loc_path = sys.argv[1]
  fp = open(loc_path, "r")
  urls = [line.strip() for line in fp]
  fp.close()
  #urls = urls[0:1000]
  blocks = len(urls) / ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    end = start + blocks
    if end > len(urls):
      end = len(urls)
    q.put(urls[start:end])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  fp = open("location_name.fr", "w")
  fp.write("\n".join(locations.keys()))
  fp.close()
  fp = open("conditions.fr", "w")
  fp.write("\n".join(conds.keys()))
  fp.close()
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python 相关文章推荐
python访问sqlserver示例
Feb 10 Python
Python 创建子进程模块subprocess详解
Apr 08 Python
在Python的web框架中配置app的教程
Apr 30 Python
基于Django框架利用Ajax实现点赞功能实例代码
Aug 19 Python
Python如何发布程序的详细教程
Oct 09 Python
python 在屏幕上逐字显示一行字的实例
Dec 24 Python
详解Python sys.argv使用方法
May 10 Python
Python 中判断列表是否为空的方法
Nov 24 Python
numpy.linalg.eig() 计算矩阵特征向量方式
Nov 29 Python
python词云库wordcloud的使用方法与实例详解
Feb 17 Python
Python ini文件常用操作方法解析
Apr 26 Python
python爬取音频下载的示例代码
Oct 19 Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
python简单文本处理的方法
Jul 10 #Python
Python实现把json格式转换成文本或sql文件
Jul 10 #Python
Python中的一些陷阱与技巧小结
Jul 10 #Python
Python中的fileinput模块的简单实用示例
Jul 09 #Python
Python中的anydbm模版和shelve模版使用指南
Jul 09 #Python
python冒泡排序简单实现方法
Jul 09 #Python
You might like
WIN98下Apache1.3.14+PHP4.0.4的安装
2006/10/09 PHP
基于php导出到Excel或CSV的详解(附utf8、gbk 编码转换)
2013/06/25 PHP
windows下安装php的memcache模块的方法
2015/04/07 PHP
Laravel中log无法写入问题的解决
2017/06/17 PHP
Windows平台实现PHP连接SQL Server2008的方法
2017/07/26 PHP
PHP+MySql实现一个简单的留言板
2020/07/19 PHP
jquery实现的让超出显示范围外的导航自动固定屏幕最顶上
2011/09/22 Javascript
在Javascript中 声明时用&quot;var&quot;与不用&quot;var&quot;的区别
2013/04/15 Javascript
动态加载脚本提升javascript性能
2014/02/24 Javascript
JavaScript对象属性检查、增加、删除、访问操作实例
2015/07/08 Javascript
jquery实现Li滚动时滚动条自动添加样式的方法
2015/08/10 Javascript
js查看一个函数的执行时间实例代码
2015/09/12 Javascript
JavaScript面向对象之私有静态变量实例分析
2016/01/14 Javascript
jQuery实现手机上输入后隐藏键盘功能
2017/01/04 Javascript
微信小程序 picker 组件详解及简单实例
2017/01/10 Javascript
微信小程序遇到修改数据后页面不渲染的问题解决
2017/03/09 Javascript
js指定步长实现单方向匀速运动
2017/07/17 Javascript
VsCode插件整理(小结)
2017/09/14 Javascript
Vue页面骨架屏注入方法
2018/05/13 Javascript
js实现查询商品案例
2020/07/22 Javascript
[02:49]2014DOTA2电竞也是体育项目! 势要把荣誉带回中国!
2014/07/20 DOTA
[56:18]VGJ.S vs Secret 2018国际邀请赛小组赛BO2 第二场 8.16
2018/08/17 DOTA
Python深入学习之内存管理
2014/08/31 Python
Python中的异常处理简明介绍
2015/04/13 Python
在Ubuntu系统下安装使用Python的GUI工具wxPython
2016/02/18 Python
python正则表达式之作业计算器
2016/03/18 Python
python 获取一个值在某个区间的指定倍数的值方法
2018/11/12 Python
Python实现简单石头剪刀布游戏
2021/01/20 Python
PyTorch的torch.cat用法
2020/06/28 Python
AURALog面试题软件测试方面
2013/10/22 面试题
市场营销管理制度
2014/01/29 职场文书
护士岗位职责
2014/02/16 职场文书
聚美优品励志广告词
2014/03/14 职场文书
要账委托书范本
2014/09/15 职场文书
教师师德表现自我评价
2015/03/05 职场文书
使用nginx动态转换图片大小生成缩略图
2021/03/31 Servers