编程 Python

Python爬取国外天气预报网站的方法

Posted in Python on July 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下：

crawl_weather.py如下：

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
# Running total of forecast URLs seen; GetLocationURLs increments it and
# prints it every 500 hits as a progress indicator.
count = 0
# Language path segment substituted into the crawl entry URL.
lang = "fr"
class Location:
  """Plain record holding a place name together with its country and language.

  Usage examples kept from the original author's notes:
    Location(False, "China", "Beijing", "zh")
    Location(True, "", "Asia", "zh")
  """

  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    # All four arguments are stored unchanged under attributes of the same name.
    self.is_beyond_country = is_beyond_country
    self.lang = lang
    self.loc_name = loc_name
    self.country_name = country_name
# Guards the shared progress counter and its progress output across threads.
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  """Collect forecast-page URLs reachable from a browse page.

  Args:
    url: Page to start from. A URL containing "weather-forecast" is treated
      as a leaf: it is counted and returned as-is without being fetched.
    recursive: When false, return only the links found directly on `url`
      (those containing "browse-locations" or "weather-forecast"). When
      true, follow every such link depth-first and return the leaves.

  Returns:
    A list of URL strings.

  Raises:
    Whatever urllib2.urlopen() / read() raise; fetch errors are not handled
    here.
  """
  global count
  if "weather-forecast" in url:
    # `count` is shared by every worker thread, so the increment and the
    # periodic progress line are done together under the lock.
    with prn_lock:
      count = count + 1
      if count % 500 == 0:
        print("count:%d" % (count))
    return [url]
  response = urllib2.urlopen(url)
  try:
    page = response.read()
  finally:
    # Release the connection even when read() fails.
    response.close()
  # Short pause between requests.
  time.sleep(0.01)
  # Sample of the markup being matched:
  # "<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  # Non-greedy groups keep each link separate when several <h6> entries
  # share one line; greedy groups would fuse them into one bogus URL.
  pattern = "<h6><a href=\"(.*?)\"><em>(.*?)</em></a></h6>"
  links = [href for href, _name in re.findall(pattern, page)
           if "browse-locations" in href or "weather-forecast" in href]
  if not recursive:
    return links
  urls = []
  for link in links:
    urls.extend(GetLocationURLs(link, True))
  return urls
# Crawl starting point (presumably the Europe/France listing, judging by the
# "eur/fr" path). Alternatives the author left disabled:
#entry_url = "http://www.accuweather.com/zh/browse-locations"
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
# Shared state for the worker pool: pool size, result list and its lock,
# and the queue of first-level URLs still to expand.
ThreadNum = 5
lock = threading.RLock()
location_urls = []
q = Queue()
# Fetch only the first level here; the workers expand each entry recursively.
sub_urls = GetLocationURLs(entry_url, False)
print(len(sub_urls))
print(sub_urls)
for url in sub_urls:
  q.put(url)
def working():
  """Worker loop: expand queued browse URLs into forecast URLs.

  Repeatedly takes one URL from the module-level queue `q`, resolves it with
  GetLocationURLs(url, True) and appends the result to the shared
  `location_urls` list while holding `lock`. The loop never returns.
  """
  while True:
    url = q.get()
    try:
      lst = GetLocationURLs(url, True)
      print("%s %d urls " % (url, len(lst)))
      with lock:
        location_urls.extend(lst)
    except Exception as e:
      # One failing URL must not kill the worker; report it and move on.
      print("%s failed: %s" % (url, e))
    finally:
      # Always acknowledge the item: a missed task_done() would leave
      # q.join() blocked forever.
      q.task_done()
# Start the daemon worker pool, then wait until every queued URL has been
# acknowledged with task_done().
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()
# One URL per line; `with` closes the file even if the write raises.
with open('locations.txt', "w") as fp:
  fp.write("\n".join(location_urls))
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下：

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
# Number of worker threads started by main(); also used to size URL batches.
ThreadNumber = 20
# Work queue; main() puts one list of URLs per item.
q = Queue()
# Two re-entrant locks; the worker in this file only takes locks[1].
locks = [threading.RLock() for _ in range(2)]
# Result collections, used as sets: key = extracted string, value = 1.
conds = {}
locations = {}
def FindCountryBreadCrumbs(page):
  """Return the lines of the `<ul id="country-breadcrumbs">` list in `page`.

  The page is scanned line by line. The result spans from the line holding
  the opening tag through the first later (or same) line containing
  "</ul>", joined with newlines. If the opening tag occurs several times,
  the last occurrence determines the start.

  Args:
    page: Full HTML text of a page.

  Returns:
    The matching lines as one string, or "" when the list is missing or is
    never closed.
  """
  lines = page.splitlines()
  # -1 means "not seen"; initialising `end` avoids the UnboundLocalError the
  # function would otherwise hit on pages without a complete list.
  start = -1
  end = -1
  opened = False
  for index, line in enumerate(lines):
    if "<ul id=\"country-breadcrumbs\">" in line:
      start = index
      opened = True
    if opened and "</ul>" in line:
      end = index
      opened = False
  if start < 0 or end < start:
    return ""
  return "\n".join(lines[start:end + 1])
def GetText(nodelist):
  """Concatenate the HTML-unescaped data of the text nodes in `nodelist`.

  Nodes whose nodeType is not TEXT_NODE are skipped; their children are not
  visited. Returns '' when there are no text nodes.
  """
  pieces = [HTMLParser.HTMLParser().unescape(item.data)
            for item in nodelist
            if item.nodeType == item.TEXT_NODE]
  return ''.join(pieces)
def FindCondition(page):
  """Extract the content of every `<span class="cond">` element in `page`.

  Returns a list with one entry per regex match, in match order. Each entry
  has its HTML entities unescaped and is then encoded as UTF-8.
  """
  pat = "<span class=\"cond\">(.*?)</span>"
  found = []
  for raw in re.findall(pat, page):
    found.append(HTMLParser.HTMLParser().unescape(raw).encode("utf-8"))
  return found
def ExtractInfo(url):
  """Download a page and extract its breadcrumb names and condition texts.

  Args:
    url: Address of the page to download.

  Returns:
    A `(locs, cds)` pair of lists. `locs` holds, for each `<li>` of the
    breadcrumb list, the text of its first `<a>` and of its first
    `<strong>` (each UTF-8 encoded, when present). `cds` is the result of
    FindCondition(page). If the download fails the pair is `([], [])`; if
    the breadcrumb markup cannot be parsed as XML it is `([], cds)`.
  """
  from xml.parsers.expat import ExpatError
  try:
    response = urllib2.urlopen(url)
    try:
      page = response.read()
    finally:
      response.close()
  except Exception:
    # Best effort: an unreachable page contributes nothing. Always return a
    # pair, because callers unpack the result into two names.
    return [], []
  cds = FindCondition(page)
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  try:
    dom = minidom.parseString(text.encode("utf-8"))
  except ExpatError:
    # Empty or malformed breadcrumb markup: keep the conditions, drop names.
    return [], cds
  locs = []
  for li in dom.getElementsByTagName("li"):
    # Same order as before: the link text first, then the <strong> text.
    for tag in ("a", "strong"):
      found = li.getElementsByTagName(tag)
      if found:
        locs.append(GetText(found[0].childNodes).encode("utf-8"))
  return locs, cds
def AddMap(lst, m):
  """Mark every item of `lst` as present in the dict `m`.

  Keys that are missing from `m`, or currently mapped to None, are set to 1;
  all other existing entries are left untouched. `m` is modified in place
  and nothing is returned.
  """
  for x in lst:
    # Identity test: `== None` would call a stored value's custom __eq__.
    if m.get(x) is None:
      m[x] = 1
def working():
  """Worker loop: extract names from batches of URLs taken from `q`.

  Each queue item is a list of URLs. For every URL the names and conditions
  returned by ExtractInfo() are collected into per-batch dicts, which are
  then merged into the shared `locations` and `conds` dicts while holding
  locks[1]. The loop never returns.
  """
  while True:
    urls = q.get()
    try:
      m = {}
      m2 = {}
      for url in urls:
        try:
          locs, cds = ExtractInfo(url)
        except Exception as e:
          # One bad page must not abort the batch or kill the thread.
          sys.stderr.write("skipping %s: %s\n" % (url, e))
          continue
        AddMap(locs, m)
        AddMap(cds, m2)
      # Merge once per batch so the shared lock is taken only briefly.
      with locks[1]:
        AddMap(m, locations)
        AddMap(m2, conds)
    finally:
      # Always acknowledge the batch: a missed task_done() would leave
      # q.join() in main() blocked forever.
      q.task_done()
def main():
  """Read URLs from the file named by argv[1] and write the names found.

  The URLs (one per line, blank lines ignored) are split into batches and
  queued for ThreadNumber daemon worker threads. Once the queue has drained,
  the keys of `locations` are written to "location_name.fr" and the keys of
  `conds` to "conditions.fr", one per line.

  Exits with status 1 and a usage message when no input path is given.
  """
  if len(sys.argv) < 2:
    sys.stderr.write("usage: %s <url-list-file>\n" % sys.argv[0])
    sys.exit(1)
  loc_path = sys.argv[1]
  with open(loc_path, "r") as fp:
    # Skip blank lines so no empty URL is ever fetched.
    urls = [line.strip() for line in fp if line.strip()]
  #urls = urls[0:1000]
  # Floor division keeps the batch size an integer under true division too;
  # the +1 keeps the range step non-zero and yields at most ThreadNumber
  # batches.
  blocks = len(urls) // ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    # Slicing clips at the end of the list, so the last batch may be shorter.
    q.put(urls[start:start + blocks])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  with open("location_name.fr", "w") as fp:
    fp.write("\n".join(locations))
  with open("conditions.fr", "w") as fp:
    fp.write("\n".join(conds))
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python爬取国外天气预报网站的方法

- Author -

speedmancs

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

python二叉树的实现实例

Nov 21 Python

Python多线程编程（三）：threading.Thread类的重要函数和方法

Apr 05 Python

Python 爬取携程所有机票的实例代码

Jun 11 Python

基于Python List的赋值方法

Jun 23 Python

python列表list保留顺序去重的实例

Dec 14 Python

python之pexpect实现自动交互的例子

Jul 25 Python

详解Python打包分发工具setuptools

Aug 05 Python

python实现邮件自动发送

Aug 10 Python

Python包，__init__.py功能与用法分析

Jan 07 Python

详解用Pytest+Allure生成漂亮的HTML图形化测试报告

Mar 31 Python

python构造IP报文实例

May 05 Python

解决Python安装cryptography报错问题

Sep 03 Python

Python实现比较两个文件夹中代码变化的方法

Jul 10 #Python

python简单文本处理的方法

Jul 10 #Python

Python实现把json格式转换成文本或sql文件

Jul 10 #Python

Python中的一些陷阱与技巧小结

Jul 10 #Python

Python中的fileinput模块的简单实用示例

Jul 09 #Python

Python中的anydbm模版和shelve模版使用指南

Jul 09 #Python

python冒泡排序简单实现方法

Jul 09 #Python

You might like

php的正则处理函数总结分析

2008/06/20 PHP

PHP 事件机制(2)

2011/03/23 PHP

Centos7.7 64位利用本地完整安装包安装lnmp/lamp套件教程

2021/03/09 Servers

javascript 树控件比较好用

2009/06/11 Javascript

JQuery Highcharts 动态生成图表的方法

2013/11/15 Javascript

分享Javascript中最常用的55个经典小技巧

2013/11/29 Javascript

JS的参数传递示例介绍

2014/02/08 Javascript

jquery控制表单输入框显示默认值的方法

2015/05/22 Javascript

node.js入门实例helloworld详解

2015/12/23 Javascript

Javascript 函数的四种调用模式

2016/11/05 Javascript

分享bootstrap学习笔记心得(组件及其属性)

2017/01/11 Javascript

Vue 2中ref属性的使用方法及注意事项

2017/06/12 Javascript

JS 中使用Promise 实现红绿灯实例代码(demo)

2017/10/20 Javascript

JavaScript实现二叉树的先序、中序及后序遍历方法详解

2017/10/26 Javascript

element-ui组件table实现自定义筛选功能的示例代码

2019/03/15 Javascript

node微信开发之获取access_token+自定义菜单

2019/03/17 Javascript

实例分析javascript中的异步

2020/06/02 Javascript

vue同个按钮控制展开和折叠同个事件操作

2020/07/29 Javascript

小程序实现上传视频功能

2020/08/18 Javascript

python 输出一个两行字符的变量

2009/02/05 Python

如何在Python中编写并发程序

2016/02/27 Python

Python如何判断数独是否合法

2016/09/08 Python

python使用两种发邮件的方式smtp和outlook示例

2017/06/02 Python

Python调用.NET库的方法步骤

2019/12/27 Python

django model通过字典更新数据实例

2020/04/01 Python

支持IE8的纯css3开发的响应式设计动画菜单教程

2014/11/05 HTML / CSS

JBL澳大利亚官方商店：扬声器、耳机和音响系统

2018/05/24 全球购物

党员个人思想汇报

2013/12/28 职场文书

工程管理专业毕业生自荐信

2014/01/24 职场文书

学雷锋月活动总结

2014/04/25 职场文书

2015年团队工作总结范文

2015/05/04 职场文书

实习证明模板

2015/06/16 职场文书

创业计划书之暑假培训班

2019/11/09 职场文书

Nginx URL重写rewrite机制原理及使用实例

2021/04/01 Servers

一文搞懂python异常处理、模块与包

2021/06/26 Python

国产动画《万圣街》日语配音版制作决定！

2022/03/20 国漫