Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
Python发送Email方法实例
Aug 21 Python
python使用Flask框架获取用户IP地址的方法
Mar 21 Python
python3中str(字符串)的使用教程
Mar 23 Python
详解多线程Django程序耗尽数据库连接的问题
Oct 08 Python
Python 可变类型和不可变类型及引用过程解析
Sep 27 Python
学python安装的软件总结
Oct 12 Python
tensorflow通过模型文件,使用tensorboard查看其模型图Graph方式
Jan 23 Python
tensorflow 分类损失函数使用小记
Feb 18 Python
Python读取配置文件(config.ini)以及写入配置文件
Apr 08 Python
python argparse模块通过后台传递参数实例
Apr 20 Python
keras 自定义loss model.add_loss的使用详解
Jun 22 Python
python向xls写入数据(包括合并,边框,对齐,列宽)
Feb 02 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
PHP下escape解码函数的实现方法
2010/08/08 PHP
浅谈PHP与C#的值类型指向区别的详解
2013/05/21 PHP
基于PHP5魔术常量与魔术方法的详解
2013/06/13 PHP
PHP多文件上传实例
2015/07/09 PHP
实例讲解yii2.0在php命令行中运行的步骤
2015/12/01 PHP
PHP设计模式之工厂模式详解
2017/10/24 PHP
PHP+redis实现的限制抢购防止商品超发功能详解
2019/09/19 PHP
javascript复制对象使用说明
2011/06/28 Javascript
Ajax执行顺序流程及回调问题分析
2012/12/10 Javascript
js解析与序列化json数据(三)json的解析探讨
2013/02/01 Javascript
返回顶部按钮响应滚动且动态显示与隐藏
2014/10/14 Javascript
JS实现样式清新的横排下拉菜单效果
2015/10/09 Javascript
如何实现JavaScript动态加载CSS和JS文件
2020/12/28 Javascript
Node.js环境下编写爬虫爬取维基百科内容的实例分享
2016/06/12 Javascript
JS导出PDF插件的方法(支持中文、图片使用路径)
2016/07/12 Javascript
AngularJs Understanding the Controller Component
2016/09/02 Javascript
AngularJS中$watch和$timeout的使用示例
2016/09/20 Javascript
vue2.0结合Element实现select动态控制input禁用实例
2017/05/12 Javascript
基于Vuejs和Element的注册插件的编写方法
2017/07/03 Javascript
详解JavaScript的BUG和错误
2018/05/07 Javascript
jQuery实现常见的隐藏与展示列表效果示例
2018/06/04 jQuery
JavaScript类型相关的常用操作总结
2019/02/14 Javascript
vue实现动态按钮功能
2019/05/13 Javascript
js刷新页面location.reload()用法详解
2019/12/09 Javascript
基于Django filter中用contains和icontains的区别(详解)
2017/12/12 Python
详解Python中如何写控制台进度条的整理
2018/03/07 Python
对Python中range()函数和list的比较
2018/04/19 Python
python 批量修改/替换数据的实例
2018/07/25 Python
太阳镜仓库,售价20美元或更少:Sunglass Warehouse
2016/09/28 全球购物
Jogun Shop中文官网:韩国知名时尚男装网站
2016/10/12 全球购物
德国电子商城:ComputerUniverse
2017/04/21 全球购物
医护人员英文求职信范文
2013/11/26 职场文书
求职推荐信范文
2013/12/01 职场文书
暖通工程师岗位职责
2014/06/12 职场文书
学习党的群众路线剖析材料
2014/10/09 职场文书
python实现的人脸识别打卡系统
2021/05/08 Python