Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
在IIS服务器上以CGI方式运行Python脚本的教程
Apr 25 Python
pyenv命令管理多个Python版本
Mar 26 Python
Python对象类型及其运算方法(详解)
Jul 05 Python
Python简单定义与使用字典dict的方法示例
Jul 25 Python
python机器学习之神经网络(一)
Dec 20 Python
Python定时发送消息的脚本:每天跟你女朋友说晚安
Oct 21 Python
Python爬虫抓取技术的一些经验
Jul 12 Python
django admin.py 外键,反向查询的实例
Jul 26 Python
python使用PIL和matplotlib获取图片像素点并合并解析
Sep 10 Python
使用python处理题库表格并转化为word形式的实现
Apr 14 Python
Keras loss函数剖析
Jul 06 Python
Python中logging日志记录到文件及自动分割的操作代码
Aug 05 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
基于数据库的在线人数,日访问量等统计
2006/10/09 PHP
解析PHP获取当前网址及域名的实现代码
2013/06/23 PHP
php使用类继承解决代码重复的问题
2015/02/11 PHP
浅谈laravel-admin的sortable和orderby使用问题
2019/10/03 PHP
Javascript 表单之间的数据传递代码
2008/12/04 Javascript
javascript之querySelector和querySelectorAll使用介绍
2011/12/20 Javascript
Jquery动态进行图片缩略的原理及实现
2013/08/13 Javascript
js将控件隐藏及display属性的使用介绍
2013/12/30 Javascript
nodejs读取memcache示例分享
2014/01/02 NodeJs
js 触发select onchange事件代码
2014/03/20 Javascript
jQuery+HTML5实现手机摇一摇换衣特效
2015/06/05 Javascript
第一篇初识bootstrap
2016/06/21 Javascript
Javascript表单特效之十大常用原理性样例代码大总结
2016/07/12 Javascript
jQuery焦点图左右转换效果
2016/12/12 Javascript
Jquery EasyUI Datagrid右键菜单实现方法
2016/12/30 Javascript
jQuery中layer分页器的使用
2017/03/13 Javascript
React性能优化系列之减少props改变的实现方法
2019/01/17 Javascript
vue.js实现会动的简历(包含底部导航功能,编辑功能)
2019/04/08 Javascript
js实现图片上传即时显示效果
2019/09/30 Javascript
使用 Github Actions 自动部署 Angular 应用到 Github Pages的方法
2020/07/20 Javascript
antd 表格列宽自适应方法以及错误处理操作
2020/10/27 Javascript
python使用Berkeley DB数据库实例
2014/09/26 Python
用Python中的__slots__缓存资源以节省内存开销的方法
2015/04/02 Python
Python实现的括号匹配判断功能示例
2018/08/25 Python
对Python 窗体(tkinter)文本编辑器(Text)详解
2018/10/11 Python
python word转pdf代码实例
2019/08/16 Python
Python socket模块方法实现详解
2019/11/05 Python
python excel和yaml文件的读取封装
2021/01/12 Python
意大利团购网站:Groupon意大利
2016/10/11 全球购物
艺术爱好者的自我评价分享
2013/10/08 职场文书
自我鉴定怎么写
2014/01/12 职场文书
中介公司区域经理岗位职责范本
2014/03/02 职场文书
2014大学生批评与自我批评思想汇报
2014/09/21 职场文书
HTML5 语义化标签(移动端必备)
2021/08/23 HTML / CSS
CSS实现五种常用的2D转换
2021/12/06 HTML / CSS
python数据分析之单因素分析线性拟合及地理编码
2022/06/25 Python