Python实现多线程抓取妹子图


Posted in Python onAugust 08, 2015

心血来潮写了个多线程抓妹子图,虽然代码还是有一些瑕疵,但是还是记录下来,分享给大家。

Pic_downloader.py

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015
 
@author: Dreace
"""
import urllib2
import sys
import time
import os
import random
from multiprocessing.dummy import Pool as ThreadPool 
type_ = sys.getfilesystemencoding()
def rename():
  return time.strftime("%Y%m%d%H%M%S")
def rename_2(name): 
  if len(name) == 2: 
    name = '0' + name + '.jpg' 
  elif len(name) == 1: 
    name = '00' + name + '.jpg' 
  else: 
    name = name + '.jpg' 
  return name
def download_pic(i):
  global count
  global time_out
  if Filter(i):
    try: 
      content = urllib2.urlopen(i,timeout = time_out)
      url_content = content.read()
      f = open(repr(random.randint(10000,999999999)) + "_" + rename_2(repr(count)),"wb")
      f.write(url_content)
      f.close()
      count += 1
    except Exception, e:
      print i + "下载超时,跳过!".decode("utf-8").encode(type_)
def Filter(content):
  for line in Filter_list:
    line=line.strip('\n')
    if content.find(line) == -1:
      return True
def get_pic(url_address):
  global pic_list
  try:
    str_ = urllib2.urlopen(url_address, timeout = time_out).read()
    url_content = str_.split("\"")
    for i in url_content:
      if i.find(".jpg") != -1:
        pic_list.append(i)  
  except Exception, e:
    print "获取图片超时,跳过!".decode("utf-8").encode(type_)
MAX = 2
count = 0
time_out = 60
thread_num = 30
pic_list = []
page_list = []
Filter_list = ["imgsize.ph.126.net","img.ph.126.net","img2.ph.126.net"]
dir_name = "C:\Photos\\"+rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time = time.time()
url_address = "http://sexy.faceks.com/?page="
for i in range(1,MAX + 1): 
  page_list.append(url_address + repr(i))
page_pool = ThreadPool(thread_num)
page_pool.map(get_pic,page_list)
print "获取到".decode("utf-8").encode(type_),len(pic_list),"张图片,开始下载!".decode("utf-8").encode(type_)
pool = ThreadPool(thread_num) 
pool.map(download_pic,pic_list)
pool.close() 
pool.join()
print count,"张图片保存在".decode("utf-8").encode(type_) + dir_name
print "共耗时".decode("utf-8").encode(type_),time.time() - start_time,"s"

我们来看下一个网友的作品

#coding: utf-8 #############################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python

import re,urllib2,HTMLParser,threading,Queue,time

#各图集入口链接
htmlDoorList = []
#包含图片的Hmtl链接
htmlUrlList = []
#图片Url链接Queue
imageUrlList = Queue.Queue(0)
#捕获图片数量
imageGetCount = 0
#已下载图片数量
imageDownloadCount = 0
#每个图集的起始地址,用于判断终止
nextHtmlUrl = ''
#本地保存路径
localSavePath = '/data/1920x1080/'

#如果你想下你需要的分辨率的,请修改replace_str,有如下分辨率可供选择1920x1200,1980x1920,1680x1050,1600x900,1440x900,1366x768,1280x1024,1024x768,1280x800
replace_str = '1920x1080'

replaced_str = '960x600'

#内页分析处理类
class ImageHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
self.nextUrl = ''
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
global imageUrlList
if(tag == 'img' and len(attrs) > 2 ):
if(attrs[0] == ('id','bigImg')):
url = attrs[1][1]
url = url.replace(replaced_str,replace_str)
imageUrlList.put(url)
global imageGetCount
imageGetCount = imageGetCount + 1
print url
elif(tag == 'a' and len(attrs) == 4):
if(attrs[0] == ('id','pageNext') and attrs[1] == ('class','next')):
global nextHtmlUrl
nextHtmlUrl = attrs[2][1];

#首页分析类
class IndexHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
self.urlList = []
self.index = 0
self.nextUrl = ''
self.tagList = ['li','a']
self.classList = ['photo-list-padding','pic']
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
if(tag == self.tagList[self.index]):
for attr in attrs:
if (attr[1] == self.classList[self.index]):
if(self.index == 0):
#第一层找到了
self.index = 1
else:
#第二层找到了
self.index = 0
print attrs[1][1]
self.urlList.append(attrs[1][1])
break
elif(tag == 'a'):
for attr in attrs:
if (attr[0] == 'id' and attr[1] == 'pageNext'):
self.nextUrl = attrs[1][1]
print 'nextUrl:',self.nextUrl
break

#首页Hmtl解析器
indexParser = IndexHtmlParser()
#内页Html解析器
imageParser = ImageHtmlParser()

#根据首页得到所有入口链接
print '开始扫描首页...'
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while (indexUrl != ''):
print '正在抓取网页:',host+indexUrl
request = urllib2.Request(host+indexUrl)
try:
m = urllib2.urlopen(request)
con = m.read()
indexParser.feed(con)
if (indexUrl == indexParser.nextUrl):
break
else:
indexUrl = indexParser.nextUrl
except urllib2.URLError,e:
print e.reason

print '首页扫描完成,所有图集链接已获得:'
htmlDoorList = indexParser.urlList

#根据入口链接得到所有图片的url
class getImageUrl(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
for door in htmlDoorList:
print '开始获取图片地址,入口地址为:',door
global nextHtmlUrl
nextHtmlUrl = ''
while(door != ''):
print '开始从网页%s获取图片...'% (host+door)
if(nextHtmlUrl != ''):
request = urllib2.Request(host+nextHtmlUrl)
else:
request = urllib2.Request(host+door)
try:
m = urllib2.urlopen(request)
con = m.read()
imageParser.feed(con)
print '下一个页面地址为:',nextHtmlUrl
if(door == nextHtmlUrl):
break
except urllib2.URLError,e:
print e.reason
print '所有图片地址均已获得:',imageUrlList

class getImage(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
global imageUrlList
print '开始下载图片...'
while(True):
print '目前捕获图片数量:',imageGetCount
print '已下载图片数量:',imageDownloadCount
image = imageUrlList.get()
print '下载文件路径:',image
try:
cont = urllib2.urlopen(image).read()
patter = '[0-9]*\.jpg';
match = re.search(patter,image);
if match:
print '正在下载文件:',match.group()
filename = localSavePath+match.group()
f = open(filename,'wb')
f.write(cont)
f.close()
global imageDownloadCount
imageDownloadCount = imageDownloadCount + 1
else:
print 'no match'
if(imageUrlList.empty()):
break
except urllib2.URLError,e:
print e.reason
print '文件全部下载完成...'

get = getImageUrl()
get.start()
print '获取图片链接线程启动:'

time.sleep(2)

download = getImage()
download.start()
print '下载图片链接线程启动:'

批量抓取指定网页上的所有图片

# -*- coding:utf-8 -*-
# coding=UTF-8
 
import os,urllib,urllib2,re
 
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
outpath = "t:\\"
 
def getHtml(url):
  webfile = urllib.urlopen(url)
  outhtml = webfile.read()
  print outhtml
  return outhtml
 
def getImageList(html):
  restr=ur'('
  restr+=ur'http:\/\/[^\s,"]*\.jpg'
  restr+=ur'|http:\/\/[^\s,"]*\.jpeg'
  restr+=ur'|http:\/\/[^\s,"]*\.png'
  restr+=ur'|http:\/\/[^\s,"]*\.gif'
  restr+=ur'|http:\/\/[^\s,"]*\.bmp'
  restr+=ur'|https:\/\/[^\s,"]*\.jpeg'  
  restr+=ur'|https:\/\/[^\s,"]*\.jpeg'
  restr+=ur'|https:\/\/[^\s,"]*\.png'
  restr+=ur'|https:\/\/[^\s,"]*\.gif'
  restr+=ur'|https:\/\/[^\s,"]*\.bmp'
  restr+=ur')'
  htmlurl = re.compile(restr)
  imgList = re.findall(htmlurl,html)
  print imgList
  return imgList
 
def download(imgList, page):
  x = 1
  for imgurl in imgList:
    filepathname=str(outpath+'pic_%09d_%010d'%(page,x)+str(os.path.splitext(urllib2.unquote(imgurl).decode('utf8').split('/')[-1])[1])).lower()
    print '[Debug] Download file :'+ imgurl+' >> '+filepathname
    urllib.urlretrieve(imgurl,filepathname)
    x+=1
 
def downImageNum(pagenum):
  page = 1
  pageNumber = pagenum
  while(page <= pageNumber):
    html = getHtml(url)#获得url指向的html内容
    imageList = getImageList(html)#获得所有图片的地址,返回列表
    download(imageList,page)#下载所有的图片
    page = page+1
 
if __name__ == '__main__':
  downImageNum(1)

以上就是给大家汇总的3款Python实现的批量抓取妹纸图片的代码了,希望对大家学习Python爬虫能够有所帮助。

Python 相关文章推荐
Python实现扫描指定目录下的子目录及文件的方法
Jul 16 Python
Python编程实现的图片识别功能示例
Aug 03 Python
快速了解python leveldb
Jan 18 Python
pandas groupby 分组取每组的前几行记录方法
Apr 20 Python
django的登录注册系统的示例代码
May 14 Python
Python爬虫包BeautifulSoup简介与安装(一)
Jun 17 Python
python简单实现矩阵的乘,加,转置和逆运算示例
Jul 10 Python
基于python框架Scrapy爬取自己的博客内容过程详解
Aug 05 Python
Python爬虫库BeautifulSoup获取对象(标签)名,属性,内容,注释
Jan 25 Python
python GUI库图形界面开发之PyQt5简单绘图板实例与代码分析
Mar 08 Python
关于python tushare Tkinter构建的简单股票可视化查询系统(Beta v0.13)
Oct 19 Python
Python3利用scapy局域网实现自动多线程arp扫描功能
Jan 21 Python
通过Python来使用七牛云存储的方法详解
Aug 07 #Python
Python爬虫框架Scrapy实战之批量抓取招聘信息
Aug 07 #Python
深入理解Python中命名空间的查找规则LEGB
Aug 06 #Python
举例详解Python中yield生成器的用法
Aug 05 #Python
Python中return语句用法实例分析
Aug 04 #Python
python函数形参用法实例分析
Aug 04 #Python
Python简明入门教程
Aug 04 #Python
You might like
PHP 字符串加密函数(在指定时间内加密还原字符串,超时无法还原)
2010/04/28 PHP
Thinkphp中的volist标签用法简介
2014/06/18 PHP
php5.3/5.4/5.5/5.6/7常见新增特性汇总整理
2020/02/27 PHP
把textarea中字符串里含有的回车换行替换成&amp;lt;br&amp;gt;的javascript代码
2007/04/20 Javascript
Mootools 1.2教程 类(一)
2009/09/15 Javascript
javascript Array对象基础知识小结
2010/11/16 Javascript
onkeypress字符按键兼容所有浏览器使用介绍
2013/04/24 Javascript
轻松学习jQuery插件EasyUI EasyUI创建CRUD应用
2015/11/30 Javascript
使用jQuery+EasyUI实现CheckBoxTree的级联选中特效
2015/12/06 Javascript
JS 日期与时间戮相互转化的简单实例
2016/06/22 Javascript
微信小程序 UI布局常用技巧整理总结
2016/12/05 Javascript
vue组件实现文字居中对齐的方法
2017/08/23 Javascript
React SSR样式及SEO的实践
2018/10/22 Javascript
javascript之分片上传,断点续传的实际项目实现详解
2019/09/05 Javascript
解决包含在label标签下的checkbox在ie8及以下版本点击事件无效果兼容的问题
2019/10/27 Javascript
python 自动化将markdown文件转成html文件的方法
2016/09/23 Python
Python文本相似性计算之编辑距离详解
2016/11/28 Python
tf.truncated_normal与tf.random_normal的详细用法
2018/03/05 Python
对Django中内置的User模型实例详解
2019/08/16 Python
Python递归调用实现数字累加的代码
2020/02/25 Python
Python3开发环境搭建详细教程
2020/06/18 Python
使用CSS3的::selection改变选中文本颜色的方法
2015/09/29 HTML / CSS
CSS3田字格列表的样式编写方法
2018/11/22 HTML / CSS
基于HTML5 Canvas 实现弹出框效果
2017/06/05 HTML / CSS
HTML5: Web 标准最巨大的飞跃
2008/10/17 HTML / CSS
Bravofly德国:预订廉价航班和酒店
2019/09/22 全球购物
用C#语言写出在本地创建一个UDP接收端口的具体过程
2016/02/22 面试题
办公文员的工作岗位职责
2013/11/12 职场文书
大学生演讲稿范文
2014/01/11 职场文书
毕业生简历自我评价范文
2014/04/09 职场文书
金融系毕业生自荐书
2014/07/08 职场文书
毕业典礼邀请函
2015/01/31 职场文书
停电放假通知
2015/04/14 职场文书
新学期新寄语,献给新生们!
2019/11/15 职场文书
html粘性页脚的具体使用
2022/01/18 HTML / CSS
详解OpenCV曝光融合
2022/04/29 Python