Python爬取国外天气预报网站的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0
class Location:
  # Location(False, "中国", "北京", "zh")
  # Location(True, "", "亚洲", "zh")
  def __init__(self, is_beyond_country, country_name, loc_name, lang):
    self.country_name = country_name
    self.loc_name = loc_name
    self.lang = lang
    self.is_beyond_country = is_beyond_country
prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
  global count
  if url.find("weather-forecast") != -1:
    count = count + 1
    if count % 500 == 0:
      prn_lock.acquire()
      print "count:%d" % (count)
      prn_lock.release()
    return [url]
  page = urllib2.urlopen(url).read()
  time.sleep(0.01)
  #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
  pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
  locs = re.findall(pattern, page)
  locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
  if not recursive:
    urls = [url for url, name in locs]
    return urls
  urls = []
  for _url, _name in locs:
    lst = GetLocationURLs(_url, True)
    urls.extend(lst)
  return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()
for url in sub_urls:
  q.put(url)
def working():
  while True:
    url = q.get()
    lst = GetLocationURLs(url, True)
    print "%s %d urls " % (url, len(lst))
    lock.acquire()
    location_urls.extend(lst)
    lock.release()
    q.task_done()
for i in range(ThreadNum):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
q.join()  
fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#  print url
#location_urls = GetLocationURLs(entry_url)
'''
def Fetch(url):
  try:
    print url
    web_path = url[0]
    local_name = url[1]   
    print "web_path:", web_path
    print "local_name:", local_name
    sContent = urllib2.urlopen(web_path).read()
    savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
    print savePath
    file = open(savePath,'wb')
    file.write(sContent)
    file.close()
    print savePath + " saved";
  except:
    pass;
def working():
  while True:
    url = q.get()
    Fetch(url)
    sleep(10)
    q.task_done()
#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
  t = Thread(target=working)
  t.setDaemon(True)
  t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
  q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''

FetchLocation.py如下:

#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
  lines = page.splitlines()
  count = 0
  start = -1
  opened = False
  for line in lines:
    if line.find("<ul id=\"country-breadcrumbs\">") != -1:
      start = count
      opened = True
    if opened and line.find("</ul>") != -1:
      end = count
      opened = False
    count = count + 1
  return "\n".join(lines[start: (end + 1)])
def GetText(nodelist):
  rc = []
  for node in nodelist:
    if node.nodeType == node.TEXT_NODE:
      rc.append(HTMLParser.HTMLParser().unescape(node.data))
  return ''.join(rc)
def FindCondition(page):
  pat = "<span class=\"cond\">(.*?)</span>"
  cds = re.findall(pat, page)
  cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
  return cds  
def ExtractInfo(url):
  try:
    page = urllib2.urlopen(url).read()
  except Exception, e:
    return []
  text = FindCountryBreadCrumbs(page)
  text = HTMLParser.HTMLParser().unescape(text)
  dom = minidom.parseString(text.encode("utf-8"))
  locs = []
  lis = dom.getElementsByTagName("li")
  for li in lis:
    adr_list = li.getElementsByTagName("a")
    if adr_list:
      locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
    strs = li.getElementsByTagName("strong")
    if strs:
      locs.append(GetText(strs[0].childNodes).encode("utf-8"))
  cds = FindCondition(page)
  return locs, cds
def AddMap(lst, m):
  for x in lst:
    if m.get(x) == None:
      m[x] = 1
def working():
  while True:
    urls = q.get()
    #print len(urls)
    m = {}
    m2 = {}
    count = 0
    for url in urls:
      count = count + 1
      #print "%d/%d" % (count, len(urls))
      locs, cds = ExtractInfo(url)
      AddMap(locs, m)
      AddMap(cds, m2)
    locks[1].acquire()
    AddMap(m.keys(), locations)
    AddMap(m2.keys(), conds)
    locks[1].release()
    q.task_done()
def main():
  if len(sys.argv) < 2:
    exit()
  loc_path = sys.argv[1]
  fp = open(loc_path, "r")
  urls = [line.strip() for line in fp]
  fp.close()
  #urls = urls[0:1000]
  blocks = len(urls) / ThreadNumber + 1
  for start in range(0, len(urls), blocks):
    end = start + blocks
    if end > len(urls):
      end = len(urls)
    q.put(urls[start:end])
  for i in range(ThreadNumber):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
  q.join()
  fp = open("location_name.fr", "w")
  fp.write("\n".join(locations.keys()))
  fp.close()
  fp = open("conditions.fr", "w")
  fp.write("\n".join(conds.keys()))
  fp.close()
if __name__ == '__main__':
  main()

希望本文所述对大家的python程序设计有所帮助。

Python 相关文章推荐
python之wxPython应用实例
Sep 28 Python
12步入门Python中的decorator装饰器使用方法
Jun 20 Python
Python Flask-web表单使用详解
Nov 18 Python
Python2.7基于笛卡尔积算法实现N个数组的排列组合运算示例
Nov 23 Python
Python+树莓派+YOLO打造一款人工智能照相机
Jan 02 Python
微信小程序跳一跳游戏 python脚本跳一跳刷高分技巧
Jan 04 Python
Windows下Python3.6安装第三方模块的方法
Nov 22 Python
使用 python pyautogui实现鼠标键盘控制功能
Aug 04 Python
详解Python中的format格式化函数的使用方法
Nov 20 Python
django 实现后台从富文本提取纯文本
Jul 02 Python
使用python批量修改XML文件中图像的depth值
Jul 22 Python
在pyCharm中下载第三方库的方法
Apr 18 Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
python简单文本处理的方法
Jul 10 #Python
Python实现把json格式转换成文本或sql文件
Jul 10 #Python
Python中的一些陷阱与技巧小结
Jul 10 #Python
Python中的fileinput模块的简单实用示例
Jul 09 #Python
Python中的anydbm模版和shelve模版使用指南
Jul 09 #Python
python冒泡排序简单实现方法
Jul 09 #Python
You might like
一棵php的类树(支持无限分类)
2006/10/09 PHP
PHP zip扩展Linux下安装过程分享
2014/05/05 PHP
Laravel 5 框架入门(四)完结篇
2015/04/09 PHP
使用PHP实现微信摇一摇周边红包
2016/01/04 PHP
php利用array_search与array_column实现二维数组查找
2019/07/08 PHP
jQuery ajax BUG:object doesn't support this property or method
2010/07/06 Javascript
extjs3 combobox取value和text案例详解
2013/02/06 Javascript
js 遍历json返回的map内容示例代码
2013/10/29 Javascript
html的DOM中document对象forms集合用法实例
2015/01/21 Javascript
JSON格式的键盘编码对照表
2015/01/29 Javascript
用js动态添加html元素,以及属性的简单实例
2016/07/19 Javascript
浅谈js中几种实用的跨域方法原理详解
2016/12/02 Javascript
详解使用JS如何制作简单的ASCII图与单极图
2017/03/31 Javascript
Nuxt.js踩坑总结分享
2018/01/18 Javascript
bootstrap fileinput插件实现预览上传照片功能
2018/01/23 Javascript
vue history 模式打包部署在域名的二级目录的配置指南
2019/07/02 Javascript
详细分析Node.js 多进程
2020/06/22 Javascript
[45:17]DOTA2-DPC中国联赛定级赛 Phoenix vs DLG BO3第三场 1月9日
2021/03/11 DOTA
浅谈Python 的枚举 Enum
2017/06/12 Python
Python安装lz4-0.10.1遇到的坑
2018/05/20 Python
用uWSGI和Nginx部署Flask项目的方法示例
2019/05/05 Python
深入了解Python枚举类型的相关知识
2019/07/09 Python
在Matplotlib图中插入LaTex公式实例
2020/04/17 Python
浅析NumPy 切片和索引
2020/09/02 Python
python中的测试框架
2020/11/13 Python
出门问问全球官方商城:Tichome音箱和TicWatch智能手表
2017/12/02 全球购物
美国球迷装备的第一来源:FOCO
2020/07/03 全球购物
电子商务网站的创业计划书
2014/01/05 职场文书
数控技校生自我鉴定
2014/04/19 职场文书
车辆委托书范本
2014/10/05 职场文书
商业用房租赁协议书
2014/10/13 职场文书
2015公务员试用期工作总结
2014/12/12 职场文书
浅析MongoDB之安全认证
2021/06/26 MongoDB
SQL注入详解及防范方法
2021/12/06 MySQL
3050和2060哪个好 性能差多少 差距有多大 谁更有性价比
2022/06/17 数码科技
数据设计之权限的实现
2022/08/05 MySQL