编程 Python

python抓取网页中图片并保存到本地

Posted in Python on December 01, 2015

在上篇文章中给大家分享了PHP源码批量抓取远程网页图片并保存到本地的实现方法，感兴趣的朋友可以点击了解详情。本文介绍用Python抓取网页中的图片并保存到本地的两种实现（示例代码基于Python 2）。

#-*-coding:utf-8-*- 
import os
import uuid
import urllib2
import cookielib
def get_file_extension(file):
  """Return the extension of *file*, including the leading dot.

  @file file name or path to inspect

  The value is the second element of ``os.path.splitext(file)``, so it is
  an empty string when the name has no extension.
  """
  _stem, extension = os.path.splitext(file)
  return extension
def mkdir(path):
  """Create the directory *path* if it is missing and return the cleaned path.

  @path directory to create

  Surrounding whitespace and trailing backslashes are removed from *path*
  before it is used; missing intermediate directories are created as well.
  """
  target = path.strip().rstrip("\\")
  if os.path.exists(target):
    return target
  os.makedirs(target)
  return target
def unique_str():
  """Return a newly generated unique string with a fixed length of 36.

  The value is the text form of a fresh ``uuid.uuid1()``.
  """
  identifier = uuid.uuid1()
  return str(identifier)
def get_file(url):
  """Fetch the content found at *url* and return it as an in-memory string.

  @url address of the file to fetch (path + file name)

  A cookie-aware opener is built and also installed as the global urllib2
  opener.  Returns the response body, or None when the request fails; the
  error is printed in that case.
  """
  try:
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    response = opener.open(urllib2.Request(url))
    try:
      return response.read()
    finally:
      # Release the connection even when reading the body fails.
      response.close()
  except Exception as e:
    # Exception rather than BaseException, so Ctrl-C and sys.exit() still
    # propagate instead of being reported as a failed download.
    print(e)
    return None
def save_file(path, file_name, data):
  """Write *data* to the file *file_name* inside the directory *path*.

  @path local directory; created when it does not exist yet
  @file_name name of the file to write inside that directory
  @data content to write; nothing is done when it is None
  """
  if data is None:
    return
  # Continue with the cleaned path returned by mkdir so the file is opened
  # inside the directory that was actually created.
  directory = mkdir(path)
  if not directory.endswith("/"):
    directory = directory + "/"
  # The with-block closes the handle even when the write raises.
  with open(directory + file_name, "wb") as handle:
    handle.write(data)
# Demo: print the extension of a sample file name.
print(get_file_extension("123.jpg"))
# Demo: create a directory and print the returned path (left disabled).
#print mkdir("d:/ljq")
# Demo: print a freshly generated unique string with a fixed length of 36.
print(unique_str())
# Demo: download one image and store it as d:/ljq/123.jpg.
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))

通过Python抓取指定Url中的图片保存至本地

# *** encoding: utf-8 ***
__author__='jiangyt'
""" 
fetch images from specific url
v1.0
""" 
import urllib, httplib, urlparse 
import re 
import random 
"""judge url exists or not""" 
def httpExists(url, max_redirects=10):
  """Return True when a HEAD request for *url* ends in HTTP status 200.

  url           -- absolute http:// or https:// address to probe
  max_redirects -- number of redirect responses that may still be followed;
                   guards against endless redirect loops

  Redirect responses (301, 302, 303, 307) are followed through their
  Location header.  Any other status, an invalid port number, a redirect
  without a Location header, too many redirects, or an exception raised
  while talking to the server is printed and yields False.
  """
  parts = urlparse.urlsplit(url)
  scheme, host, path, query = parts[0], parts[1], parts[2], parts[3]
  # An empty path is not a valid request target; ask for the root instead.
  if not path:
    path = '/'
  # Keep the query string, otherwise a different resource would be probed.
  if query:
    path = path + '?' + query
  if ':' in host:
    # port specified, try to use it
    host, port = host.split(':', 1)
    try:
      port = int(port)
    except ValueError:
      print('invalid port number %r' % (port,))
      return False
  else:
    # no port specified, use default port
    port = None
  # https URLs need a TLS connection; plain HTTP would hit the wrong port.
  if scheme == 'https':
    connection_class = httplib.HTTPSConnection
  else:
    connection_class = httplib.HTTPConnection
  found = False
  try:
    connection = connection_class(host, port=port)
    try:
      connection.request("HEAD", path)
      resp = connection.getresponse()
      status = resp.status
      reason = resp.reason
      location = resp.getheader('location', '')
    finally:
      # Always release the socket, also when the request fails.
      connection.close()
    if status == 200: # normal 'found' status
      found = True
    elif status in (301, 302, 303, 307): # recurse on redirect
      if not location:
        # Without a target, recursing would probe the same URL forever.
        print("Status %d %s without Location header : %s" % (status, reason, url))
      elif max_redirects <= 0:
        print("too many redirects : %s" % (url,))
      else:
        found = httpExists(urlparse.urljoin(url, location), max_redirects - 1)
    else: # everything else -> not found
      print("Status %d %s : %s" % (status, reason, url))
  except Exception as e:
    print("%s %s %s" % (e.__class__, e, url))
    found = False
  return found
"""get html src,return lines[]""" 
def gGetHtmlLines(url):
  """Fetch the page at *url* and return its source as a list of lines.

  Returns None when *url* is None, when httpExists() reports the page as
  not reachable, or when the download raises an exception (the error is
  printed in that case).
  """
  if url is None:
    return None
  if not httpExists(url):
    return None
  try:
    page = urllib.urlopen(url)
    try:
      return page.readlines()
    finally:
      # Release the connection even when reading fails.
      page.close()
  except Exception as e:
    # str() is required here: adding the exception object itself to a
    # string raises TypeError and hides the original error.
    print("gGetHtmlLines() error! Exception ==>>" + str(e))
    return None
"""get html src,return string""" 
def gGetHtml(url):
  """Fetch the page at *url* and return its source as one string.

  Returns None when *url* is None, when httpExists() reports the page as
  not reachable, or when the download raises an exception (the error is
  printed in that case).
  """
  if url is None:
    return None
  if not httpExists(url):
    return None
  try:
    page = urllib.urlopen(url)
    try:
      return page.read()
    finally:
      # Release the connection even when reading fails.
      page.close()
  except Exception as e:
    # str() is required here: adding the exception object itself to a
    # string raises TypeError and hides the original error.
    print("gGetHtml() error! Exception ==>>" + str(e))
    return None
"""根据url获取文件名""" 
def gGetFileName(url):
  """Return the part of *url* that follows its last "/".

  None and the empty string are handed back unchanged.
  """
  if url is None or url == "":
    return url
  return url.split("/")[-1]
"""生成随机文件名""" 
def gRandFilename(type):
  """Return a random file name that ends in "." followed by *type*.

  The base name has 32 characters: 16 pairs made of one uppercase letter
  (A-Z) followed by one digit (0-9).
  """
  pieces = []
  for _ in range(16):
    letter = chr(random.randint(65, 90))
    digit = chr(random.randint(48, 57))
    pieces.append(letter + digit)
  return ''.join(pieces) + '.' + type
"""根据url和其上的link，得到link的绝对地址""" 
def gGetAbslLink(url,link):
  """Resolve *link*, as found on the page at *url*, to an absolute address.

  url  -- address of the page that contains the link
  link -- link text taken from that page (absolute, host-relative,
          directory-relative, "../"-relative or scheme-relative "//host/...")

  Returns None when either argument is None and returns *url* unchanged
  when either argument is an empty string.  Every other case is resolved
  with the standard library's urljoin.
  """
  if url is None or link is None:
    return None
  if url == '' or link == '':
    return url
  # Function-scope import keeps this block self-contained; the fallback
  # covers interpreters where the module is named urllib.parse.
  try:
    from urlparse import urljoin
  except ImportError:
    from urllib.parse import urljoin
  return urljoin(url, link)
"""根据输入的lines，匹配正则表达式，返回list""" 
def gGetRegList(linesList,regx):
  """Collect the distinct group captures of *regx* found in *linesList*.

  linesList -- iterable of text lines, or None
  regx      -- regular expression source; it is applied case-insensitively

  Every match on every line is examined, not only the first one per line,
  so several links on the same line are all reported.  Captures are
  returned in order of first appearance without duplicates; groups that
  did not take part in a match are skipped.  Returns None when
  *linesList* is None.
  """
  if linesList is None:
    return None
  pattern = re.compile(regx, re.IGNORECASE)
  rtnList = []
  seen = set()  # constant-time duplicate check; rtnList keeps the order
  for line in linesList:
    for match in pattern.finditer(line):
      for foundStr in match.groups():
        if foundStr is None or foundStr in seen:
          continue
        seen.add(foundStr)
        rtnList.append(foundStr)
  return rtnList
"""根据url下载文件，文件名参数指定""" 
def gDownloadWithFilename(url,savePath,file):
  """Download *url* and store the content as ``savePath + file``.

  url      -- address of the file to download
  savePath -- target directory; it is concatenated with *file* as is, so
              it has to end with a path separator
  file     -- name of the local file to write

  Arguments are not validated.  Errors are printed rather than raised.
  """
  try:
    opener = urllib.URLopener()
    fp = opener.open(url)
    try:
      data = fp.read()
    finally:
      # Close the remote handle even when reading fails.
      fp.close()
    # The with-block closes the local file even when the write raises.
    with open(savePath + file, 'w+b') as out:
      out.write(data)
  except IOError as error:
    print("DOWNLOAD %s ERROR!==>>%s" % (url, error))
  except Exception as e:
    # str() is required here: adding the exception object itself to a
    # string raises TypeError and hides the original error.
    print("Exception==>>" + str(e))
"""根据url下载文件，文件名自动从url获取""" 
def gDownload(url,savePath):
  """Download *url* into *savePath*, naming the file after the URL's last segment.

  Arguments are not validated.
  """
  # A random name from gRandFilename('jpg') could be used here instead.
  gDownloadWithFilename(url, savePath, gGetFileName(url))
"""根据某网页的url,下载该网页的jpg""" 
def gDownloadHtmlJpg(downloadUrl,savePath):
  """Download every .jpg referenced by a src attribute on the page *downloadUrl*.

  downloadUrl -- address of the page to scan
  savePath    -- directory the images are written to

  The name of each processed image is printed.  Nothing happens when the
  page source cannot be obtained.
  """
  regx = r"""src\s*="?(\S+)\.jpg"""
  page_lines = gGetHtmlLines(downloadUrl)  # page source
  found = gGetRegList(page_lines, regx)  # links that match the pattern
  if found is None:
    return
  for stem in found:
    # The pattern captures the address without its extension, so ".jpg"
    # is appended again after resolving the link.
    image_url = gGetAbslLink(downloadUrl, stem) + '.jpg'
    gDownload(image_url, savePath)
    print(gGetFileName(image_url))
"""根据url取主站地址""" 
def gGetHttpAddr(url):
  """Return the site address of *url*: its first "/"-separated piece, "//" and its third piece.

  For "http://host/dir/page" this is "http://host".  An empty *url* yields
  an empty string.
  """
  if url == '':
    return ''
  pieces = url.split("/")
  return "//".join((pieces[0], pieces[2]))
"""根据url取上级目录""" 
def gGetHttpAddrFather(url):
  """Return the parent directory address of *url*, ending with "/".

  For "http://host/a/b/page.html" this is "http://host/a/b/".  An empty
  *url* yields an empty string.
  """
  if url == '':
    return ''
  pieces = url.split("/")
  root = pieces[0] + '//' + pieces[2] + '/'
  # Everything between the host and the final segment is a directory name.
  directories = pieces[3:-1]
  return root + ''.join(name + '/' for name in directories)
"""根据url和上级的link取link的绝对地址""" 
def gGetHttpAddrFatherAssign(url,link):
  """Resolve a "../"-style *link* against the page address *url*.

  url  -- address of the page that contains the link
  link -- relative link whose leading segments may be ".."

  The number of parent levels is taken from the position of the last ".."
  segment in *link*; that many directories, plus the page name, are cut
  from the end of *url* and the remaining link segments are appended.
  A link without any ".." is resolved against the page's own directory.
  Returns an empty string when either argument is empty.
  """
  if url == '' or link == '':
    return ''
  linkArray = link.split("/")
  urlArray = url.split("/")
  # Start at 0 so a link without ".." no longer hits an unbound variable.
  numOfFather = 0  # number of parent levels
  keptSegments = []
  for i, segment in enumerate(linkArray):
    if segment == '..':
      numOfFather = i + 1
    else:
      keptSegments.append(segment)
  partLink = ''.join('/' + segment for segment in keptSegments)
  # Clamp at 0: a negative slice bound would count from the end instead of
  # producing the empty prefix.
  keep = max(len(urlArray) - 1 - numOfFather, 0)
  partUrl = '/'.join(urlArray[:keep])
  return partUrl + partLink
"""根据url获取其上的相关htm、html链接，返回list""" 
def gGetHtmlLink(url):
  """Return the absolute addresses of the .htm/.html links on the page *url*.

  Each newly found address is printed.  Arguments are not validated.
  Returns an empty list when the page source cannot be obtained.
  """
  rtnList = []
  lines = gGetHtmlLines(url)
  # The extension is captured together with the address so ".html" links
  # keep their real suffix instead of being rewritten to ".htm".
  regx = r"""href="?(\S+?\.html?)"""
  links = gGetRegList(lines, regx)
  if links is None:
    # The page could not be fetched; there is nothing to iterate over.
    return rtnList
  for link in links:
    link = gGetAbslLink(url, link)
    if link not in rtnList:
      rtnList.append(link)
      print(link)
  return rtnList
"""根据url，抓取其上的jpg和其链接htm上的jpg""" 
def gDownloadAllJpg(url,savePath):
  """Download the .jpg images of the page *url* and of the .htm pages it links to.

  Arguments are not validated.
  """
  gDownloadHtmlJpg(url, savePath)
  # Then fetch the images of every linked page.
  for linked_page in gGetHtmlLink(url):
    gDownloadHtmlJpg(linked_page, savePath)
"""test""" 
def main():
  """Download the .jpg images of one hard-coded page into a hard-coded directory."""
  page_url = 'http://site.douban.com/196738/room/2462453/'  # page to fetch images from
  target_dir = '/root/python/tmp/'  # directory the images are stored in
  print('download pic from [' + page_url + ']')
  print('save to [' + target_dir + '] ...')
  gDownloadHtmlJpg(page_url, target_dir)
  print("download finished")
if __name__ != "__main__":
  print("called from intern.")
else:
  main()

以上代码是小编给大家介绍的python抓取网页中图片并保存到本地的全部内容，希望大家喜欢。

python抓取网页中图片并保存到本地

- Author -

Ruthless

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

简单谈谈Python中的元组（Tuple）和字典（Dict）

Apr 21 Python

Python操作MySQL模拟银行转账

Mar 12 Python

python中实现数组和列表读取一列的方法

Apr 03 Python

python计算两个数的百分比方法

Jun 29 Python

Selenium控制浏览器常见操作示例

Aug 13 Python

python样条插值的实现代码

Dec 17 Python

Python3.6实现带有简单界面的有道翻译小程序

Apr 16 Python

Django实现web端tailf日志文件功能及实例详解

Jul 28 Python

Python 50行爬虫抓取并处理图灵书目过程详解

Sep 20 Python

利用python3筛选excel中特定的行（行值满足某个条件/行值属于某个集合）

Sep 04 Python

利用Pycharm + Django搭建一个简单Python Web项目的步骤

Oct 22 Python

详解python模块pychartdir安装及导入问题

Oct 22 Python

利用Python学习RabbitMQ消息队列

Nov 30 #Python

MySQL中表的复制以及大型数据表的备份教程

Nov 25 #Python

python基础知识小结之集合

Nov 25 #Python

python 多线程实现检测服务器在线情况

Nov 25 #Python

Python中time模块与datetime模块在使用中的不同之处

Nov 24 #Python

简单解决Python文件中文编码问题

Nov 22 #Python

Python制作简单的网页爬虫

Nov 22 #Python

You might like

PHP 上传文件大小限制

2009/07/05 PHP

php把数组值转换成键的方法

2015/07/13 PHP

php flush无效,IIS7下php实时输出的方法

2016/08/25 PHP

关于ThinkPhp 框架表单验证及ajax验证问题

2017/07/19 PHP

php使用 readfile() 函数设置文件大小的方法

2017/08/11 PHP

从JavaScript 到 JQuery (1)学习小结

2009/02/12 Javascript

jquery实现控制表格行高亮实例

2013/06/05 Javascript

jQuery焦点图插件SaySlide

2015/12/21 Javascript

基于jQuery实现多标签页切换的效果(web前端开发)

2016/07/24 Javascript

JS实现保留n位小数的四舍五入问题示例

2016/08/03 Javascript

认识less和webstrom的less配置方法

2017/08/02 Javascript

ES6学习教程之块级作用域详解

2017/10/09 Javascript

js实现鼠标移动到图片产生遮罩效果

2017/10/21 Javascript

Node使用Sequlize连接Mysql报错：Access denied for user ‘xxx’@‘localhost’

2018/01/03 Javascript

Bootstrap 模态框自定义点击和关闭事件详解

2018/08/10 Javascript

layui加载表格,绑定新增,编辑删除,查看按钮事件的例子

2019/09/06 Javascript

基于canvasJS在PHP中制作动态图表

2020/05/30 Javascript

微信小程序之高德地图多点路线规划过程示例详解

2021/01/18 Javascript

[00:32]10月24、25日辉夜杯外卡赛附加赛开赛！

2015/10/23 DOTA

[01:02:06]LGD vs Mineski Supermajor 胜者组 BO3 第二场 6.5

2018/06/06 DOTA

[01:03:56]Mineski vs TNC 2018国际邀请赛淘汰赛BO1 8.21

2018/08/22 DOTA

go语言计算两个时间的时间差方法

2015/03/13 Python

python实现unicode转中文及转换默认编码的方法

2017/04/29 Python

浅谈function(函数)中的动态参数

2017/04/30 Python

Python中矩阵库Numpy基本操作详解

2017/11/21 Python

tensorflow实现softmax识别MNIST

2018/03/12 Python

TensorFlow实现卷积神经网络

2018/05/24 Python

IdealFit官方网站：女性蛋白质、补充剂和运动服装

2019/03/24 全球购物

Vita Fede官网：在意大利手工制作，在纽约市设计

2019/10/25 全球购物

会计毕业生自我鉴定

2013/11/04 职场文书

同学会邀请书大全

2014/01/12 职场文书

组织关系转移介绍信

2014/01/16 职场文书

四风问题查摆材料

2014/08/25 职场文书

用php如何解决大文件分片上传问题

2021/07/07 PHP

Windows11里微软已经将驱动程序安装位置A盘删除

2021/11/21 数码科技

光之国的四大叛徒：第一贝利亚导致宇宙毁灭，赛文奥特曼在榜

2022/03/18 日漫