Python爬取国外天气预报网站的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
  # Location(False, "中国", "北京", "zh")
  # Location(True, "", "亚洲", "zh")
  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    self.country_name = country_name
    self.loc_name = loc_name
    self.lang = lang
    self.is_beyond_country = is_beyond_country
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  global count
  if url.find("weather-forecast") != -1:
    count = count + 1
    if count % 500 == 0:
      prn_lock.acquire()
      print "count:%d" % (count)
      prn_lock.release()
    return [url]
  page = urllib2.urlopen(url).read()
  time.sleep(0.01)
  #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
  locs = re.findall(pattern, page)
  locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
  if not recursive:
    urls = [url for url, name in locs]
    return urls
  urls = []
  for _url, _name in locs:
    lst = GetLocationURLs(_url, True)
    urls.extend(lst)
  return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
  q.put(url)
def working():
  while True:
    url = q.get()
    lst = GetLocationURLs(url, True)
    print "%s %d urls " % (url, len(lst))
    lock.acquire()
    location_urls.extend(lst)
    lock.release()
    q.task_done()
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()  
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
  lines = page.splitlines()
  count = 0
  start = -1
  opened = False
  for line in lines:
    if line.find("<ul id=\"country-breadcrumbs\">") != -1:
      start = count
      opened = True
    if opened and line.find("</ul>") != -1:
      end = count
      opened = False
    count = count + 1
  return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
  rc = []
  for node in nodelist:
    if node.nodeType == node.TEXT_NODE:
      rc.append(HTMLParser.HTMLParser().unescape(node.data))
  return ''.join(rc)
def FindCondition(page):
  pat = "<span class=\"cond\">(.*?)</span>"
  cds = re.findall(pat, page)
  cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
  return cds  
def ExtractInfo(url):
  try:
    page = urllib2.urlopen(url).read()
  except Exception, e:
    return []
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  dom = minidom.parseString(text.encode("utf-8"))
  locs = []
  lis = dom.getElementsByTagName("li")
  for li in lis:
    adr_list = li.getElementsByTagName("a")
    if adr_list:
      locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
    strs = li.getElementsByTagName("strong")
    if strs:
      locs.append(GetText(strs[0].childNodes).encode("utf-8"))
  cds = FindCondition(page)
  return locs, cds
def AddMap(lst, m):
  for x in lst:
    if m.get(x) == None:
      m[x] = 1
def working():
  while True:
    urls = q.get()
    #print len(urls)
    m = {}
    m2 = {}
    count = 0
    for url in urls:
      count = count + 1
      #print "%d/%d" % (count, len(urls))
      locs, cds = ExtractInfo(url)
      AddMap(locs, m)
      AddMap(cds, m2)
    locks[1].acquire()
    AddMap(m.keys(), locations)
    AddMap(m2.keys(), conds)
    locks[1].release()
    q.task_done()
def main():
  if len(sys.argv) < 2:
    exit()
  loc_path = sys.argv[1]
  fp = open(loc_path, "r")
  urls = [line.strip() for line in fp]
  fp.close()
  #urls = urls[0:1000]
  blocks = len(urls) / ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    end = start + blocks
    if end > len(urls):
      end = len(urls)
    q.put(urls[start:end])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  fp = open("location_name.fr", "w")
  fp.write("\n".join(locations.keys()))
  fp.close()
  fp = open("conditions.fr", "w")
  fp.write("\n".join(conds.keys()))
  fp.close()
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python 相关文章推荐
ptyhon实现sitemap生成示例
Mar 30 Python
python查找目录下指定扩展名的文件实例
Apr 01 Python
Python查询阿里巴巴关键字排名的方法
Jul 08 Python
python中print的不换行即时输出的快速解决方法
Jul 20 Python
今天 平安夜 Python 送你一顶圣诞帽 @微信官方
Dec 25 Python
python读取Excel表格文件的方法
Sep 02 Python
Python 类方法和实例方法(@classmethod),静态方法(@staticmethod)原理与用法分析
Sep 20 Python
Python 输出详细的异常信息(traceback)方式
Apr 08 Python
Python虚拟环境库virtualenvwrapper安装及使用
Jun 17 Python
python中判断文件结束符的具体方法
Aug 04 Python
pycharm 多行批量缩进和反向缩进快捷键介绍
Jan 15 Python
Python中time与datetime模块使用方法详解
Mar 31 Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
python简单文本处理的方法
Jul 10 #Python
Python实现把json格式转换成文本或sql文件
Jul 10 #Python
Python中的一些陷阱与技巧小结
Jul 10 #Python
Python中的fileinput模块的简单实用示例
Jul 09 #Python
Python中的anydbm模版和shelve模版使用指南
Jul 09 #Python
python冒泡排序简单实现方法
Jul 09 #Python
You might like
PHP学习之数组值的操作
2011/04/17 PHP
用 Composer构建自己的 PHP 框架之基础准备
2014/10/30 PHP
php使用str_replace实现输入框回车替换br的方法
2014/11/24 PHP
php array_values 返回数组的值实例详解
2016/11/17 PHP
ExtJS 2.0实用简明教程之应用ExtJS
2009/04/29 Javascript
ajax 缓存 问题 requestheader
2010/08/01 Javascript
JavaScript版DateAdd和DateDiff函数代码
2012/03/01 Javascript
js函数调用常用方法详解
2012/12/03 Javascript
JS OffsetParent属性深入解析
2014/01/13 Javascript
浅谈Unicode与JavaScript的发展史
2015/01/19 Javascript
原生js和jQuery写的网页选项卡特效对比
2015/04/27 Javascript
javascript弹出拖动窗口
2015/08/11 Javascript
BootStrap中Datepicker控件带中文的js文件
2016/08/10 Javascript
JavaScript的new date等日期函数在safari中遇到的坑
2016/10/24 Javascript
javascript实现下雨效果
2017/03/27 Javascript
vue中Npm run build 根据环境传递参数方法来打包不同域名
2018/03/29 Javascript
vue + webpack如何绕过QQ音乐接口对host的验证详解
2018/07/01 Javascript
Vue脚手架的简单使用实例
2018/07/10 Javascript
js布局实现单选按钮控件
2020/01/17 Javascript
vue中利用iscroll.js解决pc端滚动问题
2020/02/15 Javascript
[04:28]2014DOTA2国际邀请赛 采访小兔子LGD挺进钥匙体育馆
2014/07/14 DOTA
Python使用random和tertools模块解一些经典概率问题
2015/01/28 Python
Python的Flask框架应用调用Redis队列数据的方法
2016/06/06 Python
python中对数据进行各种排序的方法
2019/07/02 Python
python实现kNN算法识别手写体数字的示例代码
2019/08/16 Python
Python标准库itertools的使用方法
2020/01/17 Python
Python中Selenium库使用教程详解
2020/07/23 Python
“型”走纽约上东区:Sam Edelman
2017/04/02 全球购物
设计师珠宝:Ylang 23
2018/05/11 全球购物
预备党员党课思想汇报
2014/01/13 职场文书
竞选班干部演讲稿300字
2014/08/20 职场文书
党员干部反四风对照检查材料思想汇报
2014/09/14 职场文书
村长反四风问题个人对照检查材料
2014/09/21 职场文书
乡镇领导班子四风对照检查材料
2014/09/27 职场文书
2014年客户经理工作总结
2014/11/20 职场文书
会议接待欢迎词范文
2015/01/26 职场文书