Python实现大文件排序的方法


Posted in Python onJuly 10, 2015

本文实例讲述了Python实现大文件排序的方法。分享给大家供大家参考。具体实现方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述对大家的Python程序设计有所帮助。

Python 相关文章推荐
python线程锁(thread)学习示例
Dec 04 Python
python使用正则表达式分析网页中的图片并进行替换的方法
Mar 26 Python
python实现定时播放mp3
Mar 29 Python
Python进程间通信用法实例
Jun 04 Python
Python搭建APNS苹果推送通知推送服务的相关模块使用指南
Jun 02 Python
python调用API实现智能回复机器人
Apr 10 Python
【python】matplotlib动态显示详解
Apr 11 Python
python pandas模块基础学习详解
Jul 03 Python
对django2.0 关联表的必填on_delete参数的含义解析
Aug 09 Python
keras model.fit 解决validation_spilt=num 的问题
Jun 19 Python
解决python pandas读取excel中多个不同sheet表格存在的问题
Jul 14 Python
Python 实现劳拉游戏的实例代码(四连环、重力四子棋)
Mar 03 Python
Python实现telnet服务器的方法
Jul 10 #Python
Python读写unicode文件的方法
Jul 10 #Python
Python实现提取谷歌音乐搜索结果的方法
Jul 10 #Python
python和bash统计CPU利用率的方法
Jul 10 #Python
Python多线程下载文件的方法
Jul 10 #Python
Python爬取国外天气预报网站的方法
Jul 10 #Python
Python实现比较两个文件夹中代码变化的方法
Jul 10 #Python
You might like
在Mac OS的PHP环境下安装配置MemCache的全过程解析
2016/02/15 PHP
php中序列化与反序列化详解
2017/02/13 PHP
使用正则替换变量
2007/05/05 Javascript
JavaScript经典效果集锦
2010/07/06 Javascript
精心挑选的12款优秀的基于jQuery的手风琴效果插件和教程
2012/08/22 Javascript
更靠谱的H5横竖屏检测方法(js代码)
2016/09/13 Javascript
TableSort.js表格排序插件使用方法详解
2017/02/10 Javascript
AngularJS使用ng-class动态增减class样式的方法示例
2017/05/18 Javascript
Bootstrap一款超好用的前端框架
2017/09/25 Javascript
jQuery中过滤器的基本用法示例
2017/10/11 jQuery
使用Vue做一个简单的todo应用的三种方式的示例代码
2018/10/20 Javascript
vue数据操作之点击事件实现num加减功能示例
2019/01/19 Javascript
vue路由--网站导航功能详解
2019/03/29 Javascript
使用70行Python代码实现一个递归下降解析器的教程
2015/04/17 Python
一个月入门Python爬虫学习,轻松爬取大规模数据
2018/01/03 Python
python中for用来遍历range函数的方法
2018/06/08 Python
实例讲解Python爬取网页数据
2018/07/08 Python
python pygame实现2048游戏
2018/11/20 Python
django ManyToManyField多对多关系的实例详解
2019/08/09 Python
python实现七段数码管和倒计时效果
2019/11/23 Python
Tensorflow 多线程与多进程数据加载实例
2020/02/05 Python
基于python实现数组格式参数加密计算
2020/04/21 Python
python用分数表示矩阵的方法实例
2021/01/11 Python
美国著名的家居用品购物网站:Bed Bath & Beyond
2018/01/05 全球购物
丝芙兰加拿大官方网站:SEPHORA加拿大
2018/11/20 全球购物
英国银首饰公司:e&e Jewellery
2021/02/11 全球购物
女大学生自我鉴定
2013/12/09 职场文书
公益活动邀请函
2014/02/05 职场文书
春季运动会广播稿大全
2014/02/19 职场文书
餐厅执行经理岗位职责范本
2014/02/26 职场文书
寒假家长评语大全
2014/04/16 职场文书
铁路安全事故反思
2014/04/26 职场文书
四风问题查摆材料
2014/08/25 职场文书
新党章心得体会
2014/09/04 职场文书
2014司机年终工作总结
2014/12/05 职场文书
幼儿园卫生保健制度
2015/08/05 职场文书