Python爬取国外天气预报网站的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
  # Location(False, "中国", "北京", "zh")
  # Location(True, "", "亚洲", "zh")
  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    self.country_name = country_name
    self.loc_name = loc_name
    self.lang = lang
    self.is_beyond_country = is_beyond_country
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  global count
  if url.find("weather-forecast") != -1:
    count = count + 1
    if count % 500 == 0:
      prn_lock.acquire()
      print "count:%d" % (count)
      prn_lock.release()
    return [url]
  page = urllib2.urlopen(url).read()
  time.sleep(0.01)
  #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
  locs = re.findall(pattern, page)
  locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
  if not recursive:
    urls = [url for url, name in locs]
    return urls
  urls = []
  for _url, _name in locs:
    lst = GetLocationURLs(_url, True)
    urls.extend(lst)
  return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
  q.put(url)
def working():
  while True:
    url = q.get()
    lst = GetLocationURLs(url, True)
    print "%s %d urls " % (url, len(lst))
    lock.acquire()
    location_urls.extend(lst)
    lock.release()
    q.task_done()
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()  
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
  lines = page.splitlines()
  count = 0
  start = -1
  opened = False
  for line in lines:
    if line.find("<ul id=\"country-breadcrumbs\">") != -1:
      start = count
      opened = True
    if opened and line.find("</ul>") != -1:
      end = count
      opened = False
    count = count + 1
  return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
  rc = []
  for node in nodelist:
    if node.nodeType == node.TEXT_NODE:
      rc.append(HTMLParser.HTMLParser().unescape(node.data))
  return ''.join(rc)
def FindCondition(page):
  pat = "<span class=\"cond\">(.*?)</span>"
  cds = re.findall(pat, page)
  cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
  return cds  
def ExtractInfo(url):
  try:
    page = urllib2.urlopen(url).read()
  except Exception, e:
    return []
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  dom = minidom.parseString(text.encode("utf-8"))
  locs = []
  lis = dom.getElementsByTagName("li")
  for li in lis:
    adr_list = li.getElementsByTagName("a")
    if adr_list:
      locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
    strs = li.getElementsByTagName("strong")
    if strs:
      locs.append(GetText(strs[0].childNodes).encode("utf-8"))
  cds = FindCondition(page)
  return locs, cds
def AddMap(lst, m):
  for x in lst:
    if m.get(x) == None:
      m[x] = 1
def working():
  while True:
    urls = q.get()
    #print len(urls)
    m = {}
    m2 = {}
    count = 0
    for url in urls:
      count = count + 1
      #print "%d/%d" % (count, len(urls))
      locs, cds = ExtractInfo(url)
      AddMap(locs, m)
      AddMap(cds, m2)
    locks[1].acquire()
    AddMap(m.keys(), locations)
    AddMap(m2.keys(), conds)
    locks[1].release()
    q.task_done()
def main():
  if len(sys.argv) < 2:
    exit()
  loc_path = sys.argv[1]
  fp = open(loc_path, "r")
  urls = [line.strip() for line in fp]
  fp.close()
  #urls = urls[0:1000]
  blocks = len(urls) / ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    end = start + blocks
    if end > len(urls):
      end = len(urls)
    q.put(urls[start:end])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  fp = open("location_name.fr", "w")
  fp.write("\n".join(locations.keys()))
  fp.close()
  fp = open("conditions.fr", "w")
  fp.write("\n".join(conds.keys()))
  fp.close()
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python 相关文章推荐
Python 自动安装 Rising 杀毒软件
Apr 24 Python
python实现查询IP地址所在地
Mar 29 Python
Python实现约瑟夫环问题的方法
May 03 Python
python模拟Django框架实例
May 17 Python
python 3.6 +pyMysql 操作mysql数据库(实例讲解)
Dec 20 Python
python-docx修改已存在的Word文档的表格的字体格式方法
May 08 Python
Python使用ConfigParser模块操作配置文件的方法
Jun 29 Python
python数据处理——对pandas进行数据变频或插值实例
Apr 22 Python
浅谈Keras参数 input_shape、input_dim和input_length用法
Jun 29 Python
python中time、datetime模块的使用
Dec 14 Python
Jupyter Notebook 如何修改字体和大小以及更改字体样式
Jun 03 Python
基于Python实现一个春节倒计时脚本
Jan 22 Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
python简单文本处理的方法
Jul 10 #Python
Python实现把json格式转换成文本或sql文件
Jul 10 #Python
Python中的一些陷阱与技巧小结
Jul 10 #Python
Python中的fileinput模块的简单实用示例
Jul 09 #Python
Python中的anydbm模版和shelve模版使用指南
Jul 09 #Python
python冒泡排序简单实现方法
Jul 09 #Python
You might like
解析php利用正则表达式解决采集内容排版的问题
2013/06/20 PHP
jQuery中的RadioButton,input,CheckBox取值赋值实现代码
2014/02/18 PHP
打开新窗口关闭当前页面不弹出关闭提示js代码
2013/03/18 Javascript
JavaScript中创建类/对象的几种方法总结
2013/11/29 Javascript
解析Javascript中难以理解的11个问题
2013/12/09 Javascript
ExtJs纵坐标值重复问题的解决方法
2014/02/27 Javascript
jquery中ajax函数执行顺序问题之如何设置同步
2014/02/28 Javascript
JavaScript处理解析JSON数据过程详解
2015/09/11 Javascript
Bootstrap组件学习之导航、标签、面包屑导航(精品)
2016/05/17 Javascript
深入理解js中this的用法
2016/05/28 Javascript
第一次接触神奇的Bootstrap表单
2016/07/27 Javascript
JS弹窗 JS弹出DIV并使整个页面背景变暗功能的实现代码
2018/04/21 Javascript
keep-alive保持组件状态的方法
2020/12/02 Javascript
对Python中type打开文件的方式介绍
2018/04/28 Python
OPENCV去除小连通区域,去除孔洞的实例讲解
2018/06/21 Python
Python matplotlib画图与中文设置操作实例分析
2019/04/23 Python
Python爬虫抓取技术的一些经验
2019/07/12 Python
树莓派使用python-librtmp实现rtmp推流h264的方法
2019/07/22 Python
python 队列基本定义与使用方法【初始化、赋值、判断等】
2019/10/24 Python
Python有参函数使用代码实例
2020/01/06 Python
python3实现raspberry pi(树莓派)4驱小车控制程序
2020/02/12 Python
超全Python图像处理讲解(多模块实现)
2020/04/13 Python
Django用户认证系统如何实现自定义
2020/11/12 Python
python实现KNN近邻算法
2020/12/30 Python
CSS3的transition和animation的用法实例介绍
2014/08/20 HTML / CSS
css3弹性盒模型(Flexbox)详细介绍
2014/10/08 HTML / CSS
HTML5 新表单类型示例代码
2018/03/20 HTML / CSS
采购主管的岗位职责
2013/12/17 职场文书
《雷雨》教学反思
2014/02/20 职场文书
地方课程教学计划
2015/01/19 职场文书
市场部岗位职责范本
2015/04/15 职场文书
2015年行政人事部工作总结
2015/05/13 职场文书
2016继续教育培训学习心得体会
2016/01/19 职场文书
2019年中学生的思想品德评语集锦
2019/12/19 职场文书
jquery插件实现悬浮的菜单
2021/04/24 jQuery
动画「半妖的夜叉姬」新BD特典图公开
2022/03/22 日漫