Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
Python素数检测实例分析
Jun 15 Python
Python+Selenium自动化实现分页(pagination)处理
Mar 31 Python
Python3实现对列表按元组指定列进行排序的方法分析
Dec 22 Python
Python Django模板之模板过滤器与自定义模板过滤器示例
Oct 18 Python
Python Django框架防御CSRF攻击的方法分析
Oct 18 Python
Python面向对象编程基础实例分析
Jan 17 Python
JAVA SWT事件四种写法实例解析
Jun 05 Python
解决redis与Python交互取出来的是bytes类型的问题
Jul 16 Python
Python爬虫定时计划任务的几种常见方法(推荐)
Jan 15 Python
python 爬取哔哩哔哩up主信息和投稿视频
Jun 07 Python
Python 处理表格进行成绩排序的操作代码
Jul 26 Python
python读取mnist数据集方法案例详解
Sep 04 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
加强版phplib的DB类
2008/03/31 PHP
PHP CKEditor 上传图片实现代码
2009/11/06 PHP
php 连接mssql数据库 初学php笔记
2010/03/01 PHP
destoon实现调用当前栏目分类及子分类和三级分类的方法
2014/08/21 PHP
PHP的serialize序列化数据以及JSON格式化数据分析
2015/10/10 PHP
PHP实现批量删除(封装)
2017/04/28 PHP
PHP实现的CURL非阻塞调用类
2018/07/26 PHP
laravel请求参数校验方法
2019/10/10 PHP
JS将数字转换成三位逗号分隔的样式(示例代码)
2014/02/19 Javascript
jQuery使用之标记元素属性用法实例
2015/01/19 Javascript
JavaScript操作Oracle数据库示例
2015/03/06 Javascript
简述Jquery与DOM对象
2015/07/10 Javascript
avalon js实现仿微博拖动图片排序
2015/08/14 Javascript
Bootstrap每天必学之表格
2015/11/23 Javascript
jQuery实现点击水纹波动动画
2016/04/10 Javascript
Vue组件模板形式实现对象数组数据循环为树形结构(实例代码)
2017/07/31 Javascript
jQuery接受后台传递的List的实例详解
2017/08/02 jQuery
防止页面url缓存中ajax中post请求的处理方法
2017/10/10 Javascript
详解NodeJs开发微信公众号
2018/05/25 NodeJs
vue实现循环切换动画
2018/10/17 Javascript
通过循环优化 JavaScript 程序
2019/06/24 Javascript
JS动态显示倒计时效果
2019/12/12 Javascript
一篇不错的Python入门教程
2007/02/08 Python
python使用正则表达式检测密码强度源码分享
2014/06/11 Python
Python切片用法实例教程
2014/09/08 Python
Django中URL视图函数的一些高级概念介绍
2015/07/20 Python
基于keras输出中间层结果的2种实现方式
2020/01/24 Python
numpy库ndarray多维数组的维度变换方法(reshape、resize、swapaxes、flatten)
2020/04/28 Python
python如何删除文件、目录
2020/06/23 Python
详解h5页面在不同ios设备上的问题总结
2019/03/01 HTML / CSS
毕业生教师求职信
2013/10/20 职场文书
大专生简历的自我评价
2013/11/26 职场文书
入党积极分子思想汇报
2014/01/02 职场文书
教师爱岗敬业演讲稿
2014/05/05 职场文书
奥巴马的演讲稿
2014/05/15 职场文书
诗词赏析-(浣溪沙)
2019/08/13 职场文书