Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
Python实现的百度站长自动URL提交小工具
Jun 27 Python
Python中处理字符串之islower()方法的使用简介
May 19 Python
Python的Tornado框架实现异步非阻塞访问数据库的示例
Jun 30 Python
Python计时相关操作详解【time,datetime】
May 26 Python
Python3实现带附件的定时发送邮件功能
Dec 22 Python
详解Django中类视图使用装饰器的方式
Aug 12 Python
Python用字典构建多级菜单功能
Jul 11 Python
详解如何用TensorFlow训练和识别/分类自定义图片
Aug 05 Python
Pytorch 使用CNN图像分类的实现
Jun 16 Python
python实现mask矩阵示例(根据列表所给元素)
Jul 30 Python
python利用paramiko实现交换机巡检的示例
Sep 22 Python
Python实现双向链表
May 25 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
PL-880隐藏功能
2021/03/01 无线电
PHP聊天室技术
2006/10/09 PHP
PHP中通过trigger_error触发PHP错误示例
2015/06/23 PHP
PHP实现linux命令tail -f
2016/02/22 PHP
php+ajax登录跳转登录实现思路
2016/07/31 PHP
Laravel5中Cookie的使用详解
2017/05/03 PHP
PHP实现的DES加密解密类定义与用法示例
2020/11/02 PHP
禁止刷新,回退的JS
2006/11/25 Javascript
Jquery替换已存在于element上的event的方法
2010/03/09 Javascript
JS 事件绑定函数代码
2010/04/28 Javascript
Extjs4 关于Store的一些操作(加载/回调/添加)
2013/04/18 Javascript
jQuery制作简洁的多级联动Select下拉框
2014/12/23 Javascript
JS实现AES加密并与PHP互通的方法分析
2017/04/19 Javascript
使用nodejs+express实现简单的文件上传功能
2017/12/27 NodeJs
JavaScript实现构造json数组的方法分析
2018/08/17 Javascript
解决vue select当前value没有更新到vue对象属性的问题
2018/08/30 Javascript
JS实现指定区域的全屏显示功能示例
2019/04/25 Javascript
Vuejs学习笔记之使用指令v-model完成表单的数据双向绑定
2019/04/29 Javascript
vue3.0 搭建项目总结(详细步骤)
2019/05/20 Javascript
Vue动态面包屑功能的实现方法
2019/07/01 Javascript
vue点击标签切换选中及互相排斥操作
2020/07/17 Javascript
Vue3+elementui plus创建项目的方法
2020/12/01 Vue.js
[53:36]Liquid vs VP Supermajor决赛 BO 第三场 6.10
2018/07/05 DOTA
Python爬虫辅助利器PyQuery模块的安装使用攻略
2016/04/24 Python
Linux下python3.7.0安装教程
2018/07/30 Python
在python中pandas读文件,有中文字符的方法
2018/12/12 Python
Python搭建代理IP池实现检测IP的方法
2019/10/27 Python
Python异常原理及异常捕捉实现过程解析
2020/03/25 Python
Django Serializer HiddenField隐藏字段实例
2020/03/31 Python
django model 条件过滤 queryset.filter(**condtions)用法详解
2020/05/20 Python
初一科学教学反思
2014/01/27 职场文书
买房协议书
2014/04/11 职场文书
给妈妈洗脚活动方案
2014/08/16 职场文书
董事长秘书工作总结
2015/08/14 职场文书
建房合同协议书
2016/03/21 职场文书
CSS link与@import的区别和用法解析
2023/05/07 HTML / CSS