编程 Python

Python实现大文件排序的方法

Posted in Python on July 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下：

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input, output):
  """Sort and de-duplicate text chunks pulled from a queue.

  Runs until a message consisting solely of the sentinel ``'STOP'``
  (ignoring surrounding whitespace) is received. For every other chunk the
  first space-separated field of each line is collected, duplicates are
  dropped, and the unique keys are emitted in sorted order as a single
  newline-joined string.

  Args:
    input: queue-like object; ``get()`` returns a text chunk or ``'STOP'``.
    output: queue-like object; ``put()`` receives each sorted chunk.
  """
  while True:
    chunk = input.get()
    # Match the sentinel against the whole message (the same test
    # write_worker uses) so that a data line reading "STOP" inside a larger
    # chunk is sorted like any other key instead of killing the worker.
    if chunk.strip() == 'STOP':
      return
    keys = set(line.split(' ')[0] for line in chunk.splitlines())
    output.put('\n'.join(sorted(keys)))
def write_worker(input, pre):
  """Write each chunk received from a queue to its own numbered file.

  Ensures the directory ``pre`` exists, then stores successive chunks as
  ``pre/0``, ``pre/1``, ... until a message that is just ``'STOP'``
  (ignoring surrounding whitespace) arrives.

  Args:
    input: queue-like object; ``get()`` returns a text chunk or ``'STOP'``.
    pre: path of the output directory.

  Raises:
    OSError: if ``pre`` cannot be created and is not already a directory.
  """
  # Create the directory through the os module rather than a shell command,
  # so paths containing spaces or shell metacharacters are handled correctly.
  try:
    os.makedirs(pre)
  except OSError:
    # An already-existing directory is fine; anything else is a real error.
    if not os.path.isdir(pre):
      raise
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s' % (pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  """Write ``content`` to ``filename``, replacing any existing file.

  Args:
    content: text to store.
    filename: path of the file to create or overwrite.
  """
  # open() + with: works on Python 2 and 3 (file() is Python 2 only) and
  # closes the handle even if the write fails.
  with open(filename, 'w') as f:
    f.write(content)
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  """Split a large text file into sorted, de-duplicated chunk files.

  The file is read ``buf_size`` characters at a time and cut at the last
  newline of each buffer. Every cut is handed to one of ``num_sort``
  ``sort_worker`` processes; a single ``write_worker`` process stores the
  sorted results in a directory named after ``filename`` without its
  extension. Files ending in ``.gz`` are read through ``gzip``.

  Args:
    filename: path of the input file.
    num_sort: number of sorter processes to start.
    buf_size: amount of data requested per read.

  Returns:
    The number of chunks handed to the sorters.
  """
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  sorted_count = 0
  # Text read so far that does not yet end in a newline.
  pending = ''
  try:
    while True:
      data = file_file.read(buf_size)
      if not data:
        break
      pending += data
      end_line = pending.rfind('\n')
      if end_line == -1:
        # A single line longer than the buffer: keep reading instead of
        # queueing an empty chunk and throwing the partial line away.
        continue
      bulk_queue.put(pending[:end_line+1])
      sorted_count += 1
      pending = pending[end_line+1:]
  finally:
    file_file.close()
  if pending:
    # Last line had no trailing newline; it is still data.
    bulk_queue.put(pending)
    sorted_count += 1
  # One sentinel per sorter, then wait for them before stopping the writer
  # so every sorted chunk has been queued ahead of the writer's sentinel.
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()

  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  # Single-argument print(...) is valid on both Python 2 and 3; the output
  # text is the same as the original two-item print statement.
  print('elasped  %s' % (datetime.now() - t,))
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  """Heap-based merge over the line-sorted files of one directory.

  Each selected file contributes its current line to a min-heap of
  ``(line, file_index)`` pairs; popping the heap yields the lines of all
  files in ascending order. ``idx`` / ``count`` select a disjoint share of
  the files so several instances can split a directory between them.
  """
  def __init__(self, dir, idx = 0, count = 1):
    """Open this instance's share of the files in ``dir`` and prime the heap.

    Args:
      dir: directory containing the sorted files.
      idx: which share (0 <= idx < count) this instance handles.
      count: total number of shares the directory is divided into.
    """
    # Partition by position in the *sorted* listing: unlike hash(name) this
    # gives the same answer in every process, even with hash randomization.
    files = sorted(os.listdir(dir))
    self.heap = []
    self.files = {}   # file index -> open file object, or None once at EOF
    self.bulks = {}   # file index -> text read but not yet consumed
    self.pre_element = None
    for i, name in enumerate(files):
      if i % count != idx: continue
      self.files[i] = open(os.path.join(dir, name))
      self.bulks[i] = ''
      first = self.get_next_element_buffered(i)
      # An empty file has no first line and must not enter the heap.
      if first is not None:
        heappush(self.heap, (first, i))
  def get_next_element_buffered(self, i):
    """Return the next line of file ``i`` without its newline.

    Returns:
      The line (possibly ``''`` for a blank line), or ``None`` once the
      file is exhausted.
    """
    bulk = self.bulks[i]
    end_line = bulk.find('\n')
    # Keep reading until a complete line is buffered or the file ends, so a
    # line that straddles a read boundary is never returned truncated.
    while end_line == -1 and self.files[i] is not None:
      buf = self.files[i].read(65536)
      if buf:
        searched = len(bulk)
        bulk += buf
        end_line = bulk.find('\n', searched)
      else:
        self.files[i].close()
        self.files[i] = None
    if end_line == -1:
      # End of file: whatever is left is an unterminated last line.
      self.bulks[i] = ''
      if bulk:
        return bulk
      return None
    self.bulks[i] = bulk[end_line+1:]
    return bulk[:end_line]
  def poppush_uniq(self):
    """Return the next line that differs from the previously returned one.

    Returns:
      The next distinct line, or ``None`` when all files are exhausted.
    """
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    """Pop the smallest line and push its file's following line.

    Returns:
      The smallest buffered line, or ``None`` when the heap is empty.
    """
    if not self.heap:
      return None
    element, index = heappop(self.heap)
    new_element = self.get_next_element_buffered(index)
    if new_element is not None:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  """Stream one share of a directory's merged, de-duplicated lines to a queue.

  Builds a ``file_heap`` for share ``idx`` of ``count`` over ``dir`` and
  forwards every value returned by ``poppush_uniq()`` to ``queue.put``.
  The terminating ``None`` is forwarded as well, after which the function
  returns.
  """
  merged = file_heap(dir, idx, count)
  item = merged.poppush_uniq()
  while item is not None:
    queue.put(item)
    item = merged.poppush_uniq()
  # Pass the end-of-stream marker on to the consumer.
  queue.put(item)
def heappoppush2(dir, queue, count = 1):
  """Merge a directory of sorted files using ``count`` helper processes.

  Each helper runs ``heappoppush`` on its own share of ``dir`` and streams
  its lines back through a ``queue_buffer``. The helper streams are merged
  here with a heap, consecutive duplicates are dropped, and the result is
  written to ``queue`` followed by a final ``None``.

  Args:
    dir: directory containing the sorted files.
    queue: object with ``put()`` that receives the merged lines.
    count: number of helper processes to start.
  """
  heap = []
  procs = []
  queues = []
  for i in range(count):
    q_buf = queue_buffer(Queue(1024))
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)

  def refill(i):
    # Fetch helper i's next line. Its end marker (None) is simply not
    # pushed, so the heap only ever compares strings and does not rely on
    # how None orders against str.
    d = queues[i].get()
    if d is not None:
      heappush(heap, (d, i))

  for i in range(count):
    refill(i)
  pre_element = None
  while heap:
    d, i = heappop(heap)
    # Stream i just gave up a line, so it has not sent its end marker yet
    # and reading from it again cannot block forever.
    refill(i)
    if d != pre_element:
      pre_element = d
      queue.put(d)
  queue.put(None)
  for p in procs:
    p.join()
def merge_file(dir):
  """Merge the sorted files in ``dir`` into the single file ``dir + '.merge'``.

  The output holds every distinct line yielded by ``file_heap`` in
  ascending order, one per line. An existing output file is replaced.

  Args:
    dir: directory containing the sorted files.
  """
  heap = file_heap( dir )
  # Mode 'w' truncates, which replaces the former "rm -f" plus append.
  with open(dir+'.merge', 'w') as fmerge:
    element = heap.poppush_uniq()
    # Test for the end marker *before* writing; concatenating None with
    # '\n' raised TypeError at the end of every merge.
    while element is not None:
      fmerge.write(element+'\n')
      element = heap.poppush_uniq()
class queue_buffer:
  """Batching wrapper that moves items through a queue as lists.

  ``put`` collects items locally and sends the whole list to the wrapped
  queue when the item is ``None`` or more than 1024 items are pending.
  ``get`` fetches one such list from the wrapped queue whenever its local
  read buffer is empty and hands the items out one at a time.
  """
  def __init__(self, queue):
    self.q = queue
    # Separate read and write buffers; each side of the queue uses one.
    self.rbuf, self.wbuf = [], []
  def get(self):
    """Return the next item, fetching a new batch from the queue if needed."""
    if not self.rbuf:
      self.rbuf = self.q.get()
    return self.rbuf.pop(0)
  def put(self, d):
    """Buffer ``d``; flush the batch on ``None`` or once it exceeds 1024 items."""
    self.wbuf.append(d)
    if d is not None and len(self.wbuf) <= 1024:
      return
    self.q.put(self.wbuf)
    self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  """Write the keys that appear in only one of two large files.

  Both inputs are split into sorted chunk directories (named after each
  file without its extension) by ``split_sort_file``; the chunks of each
  side are then merged in background processes and the two sorted streams
  are compared. Keys found only in ``file_new`` are written to
  ``file_diff`` prefixed with ``'< '``, keys found only in ``file_old``
  with ``'> '``. An existing ``file_diff`` is replaced.

  Args:
    file_old: path of the old input file.
    file_new: path of the new input file.
    file_diff: path of the result file.
    buf: read buffer size passed to ``split_sort_file``.
  """
  import shutil
  from file_split import split_sort_file
  print('buffer size %s' % (buf,))
  dir_old = os.path.splitext(file_old)[0]
  dir_new = os.path.splitext(file_new)[0]
  # Remove stale chunk directories without going through a shell, so paths
  # with spaces or metacharacters cannot delete the wrong thing.
  shutil.rmtree(dir_old, ignore_errors=True)
  shutil.rmtree(dir_new, ignore_errors=True)
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print('split elasped  %s' % (datetime.now() - t,))
  # Diagnostic only: prints the total line count of each chunk directory
  # (requires a POSIX shell with cat and wc).
  os.system('cat %s/* | wc -l'%dir_old)
  os.system('cat %s/* | wc -l'%dir_new)
  t = datetime.now()
  old_queue = queue_buffer(Queue(1024))
  new_queue = queue_buffer(Queue(1024))
  h1 = Process(target=heappoppush2, args=(dir_old, old_queue, 3))
  h2 = Process(target=heappoppush2, args=(dir_new, new_queue, 3))
  h1.start()
  h2.start()
  old_count, new_count = 0, 0
  # Mode 'w' truncates, which replaces the former "rm -f" plus append.
  with open(file_diff, 'w') as zdiff:
    old = old_queue.get()
    new = new_queue.get()
    while old is not None or new is not None:
      # The None tests come first: None marks an exhausted stream and must
      # never be ordered against, or concatenated with, a string.
      if old is None or (new is not None and old > new):
        zdiff.write('< '+new+'\n')
        new = new_queue.get()
        new_count +=1
      elif new is None or old < new:
        zdiff.write('> '+old+'\n')
        old = old_queue.get()
        old_count +=1
      else:
        # Same key on both sides: not part of the diff.
        old = old_queue.get()
        new = new_queue.get()
  print('new_count: %s' % (new_count,))
  print('old_count: %s' % (old_count,))
  print('diff elasped  %s' % (datetime.now() - t,))
  h1.join()
  h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python实现大文件排序的方法

- Author -

Sephiroth

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

在IIS服务器上以CGI方式运行Python脚本的教程

Apr 25 Python

pyenv命令管理多个Python版本

Mar 26 Python

Python对象类型及其运算方法(详解)

Jul 05 Python

Python简单定义与使用字典dict的方法示例

Jul 25 Python

python机器学习之神经网络（一）

Dec 20 Python

Python定时发送消息的脚本:每天跟你女朋友说晚安

Oct 21 Python

Python爬虫抓取技术的一些经验

Jul 12 Python

django admin.py 外键,反向查询的实例

Jul 26 Python

python使用PIL和matplotlib获取图片像素点并合并解析

Sep 10 Python

使用python处理题库表格并转化为word形式的实现

Apr 14 Python

Keras loss函数剖析

Jul 06 Python

Python中logging日志记录到文件及自动分割的操作代码

Aug 05 Python

Python实现telnet服务器的方法

Jul 10 #Python

Python读写unicode文件的方法

Jul 10 #Python

Python实现提取谷歌音乐搜索结果的方法

Jul 10 #Python

python和bash统计CPU利用率的方法

Jul 10 #Python

Python多线程下载文件的方法

Jul 10 #Python

Python爬取国外天气预报网站的方法

Jul 10 #Python

Python实现比较两个文件夹中代码变化的方法

Jul 10 #Python

You might like

基于数据库的在线人数，日访问量等统计

2006/10/09 PHP

解析PHP获取当前网址及域名的实现代码

2013/06/23 PHP

php使用类继承解决代码重复的问题

2015/02/11 PHP

浅谈laravel-admin的sortable和orderby使用问题

2019/10/03 PHP

Javascript 表单之间的数据传递代码

2008/12/04 Javascript

javascript之querySelector和querySelectorAll使用介绍

2011/12/20 Javascript

Jquery动态进行图片缩略的原理及实现

2013/08/13 Javascript

js将控件隐藏及display属性的使用介绍

2013/12/30 Javascript

nodejs读取memcache示例分享

2014/01/02 NodeJs

js 触发select onchange事件代码

2014/03/20 Javascript

jQuery+HTML5实现手机摇一摇换衣特效

2015/06/05 Javascript

第一篇初识bootstrap

2016/06/21 Javascript

Javascript表单特效之十大常用原理性样例代码大总结

2016/07/12 Javascript

jQuery焦点图左右转换效果

2016/12/12 Javascript

Jquery EasyUI Datagrid右键菜单实现方法

2016/12/30 Javascript

jQuery中layer分页器的使用

2017/03/13 Javascript

React性能优化系列之减少props改变的实现方法

2019/01/17 Javascript

vue.js实现会动的简历(包含底部导航功能，编辑功能)

2019/04/08 Javascript

js实现图片上传即时显示效果

2019/09/30 Javascript

使用 Github Actions 自动部署 Angular 应用到 Github Pages的方法

2020/07/20 Javascript

antd 表格列宽自适应方法以及错误处理操作

2020/10/27 Javascript

python使用Berkeley DB数据库实例

2014/09/26 Python

用Python中的__slots__缓存资源以节省内存开销的方法

2015/04/02 Python

Python实现的括号匹配判断功能示例

2018/08/25 Python

对Python 窗体(tkinter)文本编辑器(Text)详解

2018/10/11 Python

python word转pdf代码实例

2019/08/16 Python

Python socket模块方法实现详解

2019/11/05 Python

python excel和yaml文件的读取封装

2021/01/12 Python

意大利团购网站：Groupon意大利

2016/10/11 全球购物

艺术爱好者的自我评价分享

2013/10/08 职场文书

自我鉴定怎么写

2014/01/12 职场文书

中介公司区域经理岗位职责范本

2014/03/02 职场文书

2014大学生批评与自我批评思想汇报

2014/09/21 职场文书

HTML5 语义化标签(移动端必备)

2021/08/23 HTML / CSS

CSS实现五种常用的2D转换

2021/12/06 HTML / CSS

python数据分析之单因素分析线性拟合及地理编码

2022/06/25 Python