Python实现多线程抓取妹子图


Posted in Python onAugust 08, 2015

心血来潮写了个多线程抓妹子图,虽然代码还是有一些瑕疵,但是还是记录下来,分享给大家。

Pic_downloader.py

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015
 
@author: Dreace
"""
import urllib2
import sys
import time
import os
import random
from multiprocessing.dummy import Pool as ThreadPool 
type_ = sys.getfilesystemencoding()
def rename():
  return time.strftime("%Y%m%d%H%M%S")
def rename_2(name): 
  if len(name) == 2: 
    name = '0' + name + '.jpg' 
  elif len(name) == 1: 
    name = '00' + name + '.jpg' 
  else: 
    name = name + '.jpg' 
  return name
def download_pic(i):
  global count
  global time_out
  if Filter(i):
    try: 
      content = urllib2.urlopen(i,timeout = time_out)
      url_content = content.read()
      f = open(repr(random.randint(10000,999999999)) + "_" + rename_2(repr(count)),"wb")
      f.write(url_content)
      f.close()
      count += 1
    except Exception, e:
      print i + "下载超时,跳过!".decode("utf-8").encode(type_)
def Filter(content):
  for line in Filter_list:
    line=line.strip('\n')
    if content.find(line) == -1:
      return True
def get_pic(url_address):
  global pic_list
  try:
    str_ = urllib2.urlopen(url_address, timeout = time_out).read()
    url_content = str_.split("\"")
    for i in url_content:
      if i.find(".jpg") != -1:
        pic_list.append(i)  
  except Exception, e:
    print "获取图片超时,跳过!".decode("utf-8").encode(type_)
MAX = 2
count = 0
time_out = 60
thread_num = 30
pic_list = []
page_list = []
Filter_list = ["imgsize.ph.126.net","img.ph.126.net","img2.ph.126.net"]
dir_name = "C:\Photos\\"+rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time = time.time()
url_address = "http://sexy.faceks.com/?page="
for i in range(1,MAX + 1): 
  page_list.append(url_address + repr(i))
page_pool = ThreadPool(thread_num)
page_pool.map(get_pic,page_list)
print "获取到".decode("utf-8").encode(type_),len(pic_list),"张图片,开始下载!".decode("utf-8").encode(type_)
pool = ThreadPool(thread_num) 
pool.map(download_pic,pic_list)
pool.close() 
pool.join()
print count,"张图片保存在".decode("utf-8").encode(type_) + dir_name
print "共耗时".decode("utf-8").encode(type_),time.time() - start_time,"s"

我们来看下一个网友的作品

#coding: utf-8 #############################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python

import re,urllib2,HTMLParser,threading,Queue,time

#各图集入口链接
htmlDoorList = []
#包含图片的Hmtl链接
htmlUrlList = []
#图片Url链接Queue
imageUrlList = Queue.Queue(0)
#捕获图片数量
imageGetCount = 0
#已下载图片数量
imageDownloadCount = 0
#每个图集的起始地址,用于判断终止
nextHtmlUrl = ''
#本地保存路径
localSavePath = '/data/1920x1080/'

#如果你想下你需要的分辨率的,请修改replace_str,有如下分辨率可供选择1920x1200,1980x1920,1680x1050,1600x900,1440x900,1366x768,1280x1024,1024x768,1280x800
replace_str = '1920x1080'

replaced_str = '960x600'

#内页分析处理类
class ImageHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
self.nextUrl = ''
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
global imageUrlList
if(tag == 'img' and len(attrs) > 2 ):
if(attrs[0] == ('id','bigImg')):
url = attrs[1][1]
url = url.replace(replaced_str,replace_str)
imageUrlList.put(url)
global imageGetCount
imageGetCount = imageGetCount + 1
print url
elif(tag == 'a' and len(attrs) == 4):
if(attrs[0] == ('id','pageNext') and attrs[1] == ('class','next')):
global nextHtmlUrl
nextHtmlUrl = attrs[2][1];

#首页分析类
class IndexHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
self.urlList = []
self.index = 0
self.nextUrl = ''
self.tagList = ['li','a']
self.classList = ['photo-list-padding','pic']
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
if(tag == self.tagList[self.index]):
for attr in attrs:
if (attr[1] == self.classList[self.index]):
if(self.index == 0):
#第一层找到了
self.index = 1
else:
#第二层找到了
self.index = 0
print attrs[1][1]
self.urlList.append(attrs[1][1])
break
elif(tag == 'a'):
for attr in attrs:
if (attr[0] == 'id' and attr[1] == 'pageNext'):
self.nextUrl = attrs[1][1]
print 'nextUrl:',self.nextUrl
break

#首页Hmtl解析器
indexParser = IndexHtmlParser()
#内页Html解析器
imageParser = ImageHtmlParser()

#根据首页得到所有入口链接
print '开始扫描首页...'
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while (indexUrl != ''):
print '正在抓取网页:',host+indexUrl
request = urllib2.Request(host+indexUrl)
try:
m = urllib2.urlopen(request)
con = m.read()
indexParser.feed(con)
if (indexUrl == indexParser.nextUrl):
break
else:
indexUrl = indexParser.nextUrl
except urllib2.URLError,e:
print e.reason

print '首页扫描完成,所有图集链接已获得:'
htmlDoorList = indexParser.urlList

#根据入口链接得到所有图片的url
class getImageUrl(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
for door in htmlDoorList:
print '开始获取图片地址,入口地址为:',door
global nextHtmlUrl
nextHtmlUrl = ''
while(door != ''):
print '开始从网页%s获取图片...'% (host+door)
if(nextHtmlUrl != ''):
request = urllib2.Request(host+nextHtmlUrl)
else:
request = urllib2.Request(host+door)
try:
m = urllib2.urlopen(request)
con = m.read()
imageParser.feed(con)
print '下一个页面地址为:',nextHtmlUrl
if(door == nextHtmlUrl):
break
except urllib2.URLError,e:
print e.reason
print '所有图片地址均已获得:',imageUrlList

class getImage(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
global imageUrlList
print '开始下载图片...'
while(True):
print '目前捕获图片数量:',imageGetCount
print '已下载图片数量:',imageDownloadCount
image = imageUrlList.get()
print '下载文件路径:',image
try:
cont = urllib2.urlopen(image).read()
patter = '[0-9]*\.jpg';
match = re.search(patter,image);
if match:
print '正在下载文件:',match.group()
filename = localSavePath+match.group()
f = open(filename,'wb')
f.write(cont)
f.close()
global imageDownloadCount
imageDownloadCount = imageDownloadCount + 1
else:
print 'no match'
if(imageUrlList.empty()):
break
except urllib2.URLError,e:
print e.reason
print '文件全部下载完成...'

get = getImageUrl()
get.start()
print '获取图片链接线程启动:'

time.sleep(2)

download = getImage()
download.start()
print '下载图片链接线程启动:'

批量抓取指定网页上的所有图片

# -*- coding:utf-8 -*-
# coding=UTF-8
 
import os,urllib,urllib2,re
 
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
outpath = "t:\\"
 
def getHtml(url):
  webfile = urllib.urlopen(url)
  outhtml = webfile.read()
  print outhtml
  return outhtml
 
def getImageList(html):
  restr=ur'('
  restr+=ur'http:\/\/[^\s,"]*\.jpg'
  restr+=ur'|http:\/\/[^\s,"]*\.jpeg'
  restr+=ur'|http:\/\/[^\s,"]*\.png'
  restr+=ur'|http:\/\/[^\s,"]*\.gif'
  restr+=ur'|http:\/\/[^\s,"]*\.bmp'
  restr+=ur'|https:\/\/[^\s,"]*\.jpeg'  
  restr+=ur'|https:\/\/[^\s,"]*\.jpeg'
  restr+=ur'|https:\/\/[^\s,"]*\.png'
  restr+=ur'|https:\/\/[^\s,"]*\.gif'
  restr+=ur'|https:\/\/[^\s,"]*\.bmp'
  restr+=ur')'
  htmlurl = re.compile(restr)
  imgList = re.findall(htmlurl,html)
  print imgList
  return imgList
 
def download(imgList, page):
  x = 1
  for imgurl in imgList:
    filepathname=str(outpath+'pic_%09d_%010d'%(page,x)+str(os.path.splitext(urllib2.unquote(imgurl).decode('utf8').split('/')[-1])[1])).lower()
    print '[Debug] Download file :'+ imgurl+' >> '+filepathname
    urllib.urlretrieve(imgurl,filepathname)
    x+=1
 
def downImageNum(pagenum):
  page = 1
  pageNumber = pagenum
  while(page <= pageNumber):
    html = getHtml(url)#获得url指向的html内容
    imageList = getImageList(html)#获得所有图片的地址,返回列表
    download(imageList,page)#下载所有的图片
    page = page+1
 
if __name__ == '__main__':
  downImageNum(1)

以上就是给大家汇总的3款Python实现的批量抓取妹纸图片的代码了,希望对大家学习Python爬虫能够有所帮助。

Python 相关文章推荐
Python文件操作之合并文本文件内容示例代码
Sep 19 Python
python3操作微信itchat实现发送图片
Feb 24 Python
TensorFlow实现RNN循环神经网络
Feb 28 Python
python 制作自定义包并安装到系统目录的方法
Oct 27 Python
python利用百度AI实现文字识别功能
Nov 27 Python
python检测IP地址变化并触发事件
Dec 26 Python
详解python中TCP协议中的粘包问题
Mar 22 Python
python如何制作英文字典
Jun 25 Python
Python实现word2Vec model过程解析
Dec 16 Python
python多进程下的生产者和消费者模型
May 07 Python
python中如何进行连乘计算
May 28 Python
Django实现简单的分页功能
Feb 22 Python
通过Python来使用七牛云存储的方法详解
Aug 07 #Python
Python爬虫框架Scrapy实战之批量抓取招聘信息
Aug 07 #Python
深入理解Python中命名空间的查找规则LEGB
Aug 06 #Python
举例详解Python中yield生成器的用法
Aug 05 #Python
Python中return语句用法实例分析
Aug 04 #Python
python函数形参用法实例分析
Aug 04 #Python
Python简明入门教程
Aug 04 #Python
You might like
动画 《Pokemon Sword·Shield》系列WEB动画《薄明之翼》第2话声优阵容公开!
2020/03/06 日漫
PHP函数常用用法小结
2010/02/08 PHP
PHP批量去除BOM头内容信息代码
2016/03/11 PHP
php实现简单爬虫的开发
2016/03/28 PHP
PHPExcel 修改已存在Excel的方法
2018/05/03 PHP
Phpstorm+Xdebug断点调试PHP的方法
2018/05/14 PHP
Laravel 关联模型-关联新增和关联更新的方法
2019/10/10 PHP
datePicker——日期选择控件(with jquery)
2007/02/20 Javascript
javascript学习笔记(五)正则表达式
2011/04/08 Javascript
jQuery插件原来如此简单 jQuery插件的机制及实战
2012/02/07 Javascript
js操作iframe父子窗体示例
2014/05/22 Javascript
js动态修改整个页面样式达到换肤效果
2014/05/23 Javascript
js用typeof方法判断undefined类型
2014/07/15 Javascript
Javascript函数的参数
2015/07/16 Javascript
jQuery常用样式操作实例分析(获取、设置、追加、删除、判断等)
2016/09/08 Javascript
Knockout结合Bootstrap创建动态UI实现产品列表管理
2016/09/14 Javascript
Angularjs通过指令监听ng-repeat渲染完成后执行脚本的方法
2016/12/31 Javascript
微信小程序 动态绑定数据及动态事件处理
2017/03/14 Javascript
基于EasyUI的基础之上实现树形功能菜单
2017/06/28 Javascript
最全正则表达式总结:验证QQ号、手机号、Email、中文、邮编、身份证、IP地址等
2017/08/16 Javascript
JS实现遍历不规则多维数组的方法
2018/03/21 Javascript
vue监听对象及对象属性问题
2018/08/20 Javascript
JavaScript 反射和属性赋值实例解析
2019/10/28 Javascript
Vue页面刷新记住页面状态的实现
2019/12/27 Javascript
js实现全选和全不选功能
2020/07/28 Javascript
Python中运行并行任务技巧
2015/02/26 Python
Python编程之event对象的用法实例分析
2017/03/23 Python
Python中循环引用(import)失败的解决方法
2018/04/22 Python
PyQt5 对图片进行缩放的实例
2019/06/18 Python
如何使用Python实现自动化水军评论
2019/06/26 Python
django最快程序开发流程详解
2019/07/19 Python
怎样有效的进行自我评价
2013/10/06 职场文书
慰问信格式规范
2015/03/23 职场文书
企业培训简报范文
2015/07/20 职场文书
全家福照片寄语怎么写?
2019/04/02 职场文书
利用Python判断你的密码难度等级
2021/06/02 Python