JAVA切分大檔案

壹頁書發表於2014-05-14
假設一個很大的文字檔案,每行以\n結束。
下面的程式碼可以切分指定的大檔案,並且保證切分出的每個檔案都以\n結束。

這是外部排序切分的前提條件。


  1. import java.io.BufferedInputStream;
  2. import java.io.BufferedOutputStream;
  3. import java.io.File;
  4. import java.io.FileInputStream;
  5. import java.io.FileOutputStream;
  6. import java.io.IOException;
  7. import java.io.RandomAccessFile;
  8. import java.nio.ByteBuffer;
  9. import java.nio.MappedByteBuffer;
  10. import java.nio.channels.FileChannel;
  11. import java.nio.channels.FileChannel.MapMode;
  12. import java.util.ArrayList;
  13. import java.util.List;

  /**
   * Splits a large text file whose lines end with '\n' into several smaller
   * files, cutting only directly after a '\n' so that no line is broken across
   * two output files. Three interchangeable copy strategies are offered: plain
   * streams, memory-mapped buffers and channel-to-channel transfer.
   */
  public class Sort {
      /** Size of the scratch buffer used by the stream-based copy. */
      private static final int COPY_BUFFER_SIZE = 8192;

      /**
       * Command-line entry point.
       *
       * @param args optional: {@code [inputFile [pieceCount [outputPrefix]]]};
       *             missing values fall back to "F:/t.txt", 10 and "F:/"
       * @throws IOException if reading the input or writing a piece fails
       */
      public static void main(String[] args) throws IOException {
          String file = args.length > 0 ? args[0] : "F:/t.txt";
          int piece = args.length > 1 ? Integer.parseInt(args[1]) : 10;
          String outputDirectory = args.length > 2 ? args[2] : "F:/";
          Sort s = new Sort();
          s.splitByChannel(file, piece, outputDirectory);
      }

      /**
       * Splits the file using buffered IO streams.
       *
       * @param file            path of the file to split
       * @param piece           desired number of pieces (must be positive); fewer
       *                        pieces are produced when there are not enough
       *                        line breaks
       * @param outputDirectiry prefix prepended verbatim to every output file
       *                        name, so a directory must end with a separator
       * @return the written files, in input order
       * @throws IOException              if reading or writing fails
       * @throws IllegalArgumentException if {@code piece} is not positive
       */
      public List<File> splitByStream(String file, int piece, String outputDirectiry) throws IOException {
          List<File> result = new ArrayList<File>();
          List<Point> list = blocking(new File(file), piece);
          for (int i = 0; i < list.size(); i++) {
              Point point = list.get(i);
              File outputFile = new File(outputDirectiry + i + "_byStream.txt");
              // try-with-resources closes (and therefore flushes) both streams
              // even when the copy fails half-way.
              try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
                   BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(outputFile))) {
                  skipFully(in, point.getSkipSize());
                  copy(in, out, point.getLength());
              }
              result.add(outputFile);
          }
          return result;
      }

      /**
       * Splits the file using memory-mapped buffers.
       *
       * @param file            path of the file to split
       * @param piece           desired number of pieces (must be positive)
       * @param outputDirectiry prefix prepended verbatim to every output file
       *                        name, so a directory must end with a separator
       * @return the written files, in input order
       * @throws IOException              if reading or writing fails
       * @throws IllegalArgumentException if {@code piece} is not positive
       */
      public List<File> splitByMappedByteBuffer(String file, int piece, String outputDirectiry) throws IOException {
          List<File> result = new ArrayList<File>();
          List<Point> list = blocking(new File(file), piece);
          try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
              FileChannel in = source.getChannel();
              for (int i = 0; i < list.size(); i++) {
                  Point point = list.get(i);
                  File outputFile = new File(outputDirectiry + i + "_byMappedByteBuffer.txt");
                  try (RandomAccessFile target = new RandomAccessFile(outputFile, "rw")) {
                      // "rw" does not truncate: fix the size up front so a longer
                      // pre-existing file cannot leave stale bytes after the piece.
                      target.setLength(point.getLength());
                      if (point.getLength() > 0) {
                          MappedByteBuffer outBuffer =
                                  target.getChannel().map(MapMode.READ_WRITE, 0, point.getLength());
                          MappedByteBuffer inBuffer =
                                  in.map(MapMode.READ_ONLY, point.getSkipSize(), point.getLength());
                          outBuffer.put(inBuffer);
                          outBuffer.force();
                      }
                  }
                  result.add(outputFile);
              }
          }
          return result;
      }

      /**
       * Splits the file by transferring bytes directly between file channels.
       *
       * @param file            path of the file to split
       * @param piece           desired number of pieces (must be positive)
       * @param outputDirectiry prefix prepended verbatim to every output file
       *                        name, so a directory must end with a separator
       * @return the written files, in input order
       * @throws IOException              if reading or writing fails, or the
       *                                  input ends before a piece is complete
       * @throws IllegalArgumentException if {@code piece} is not positive
       */
      public List<File> splitByChannel(String file, int piece, String outputDirectiry) throws IOException {
          List<File> result = new ArrayList<File>();
          List<Point> list = blocking(new File(file), piece);
          try (FileInputStream source = new FileInputStream(file);
               FileChannel in = source.getChannel()) {
              for (int i = 0; i < list.size(); i++) {
                  Point point = list.get(i);
                  File outputFile = new File(outputDirectiry + i + "_byChannel.txt");
                  try (FileOutputStream target = new FileOutputStream(outputFile);
                       FileChannel out = target.getChannel()) {
                      long position = point.getSkipSize();
                      long remaining = point.getLength();
                      // transferTo may move fewer bytes than requested, so loop
                      // until the whole piece has been handed over.
                      while (remaining > 0) {
                          long moved = in.transferTo(position, remaining, out);
                          if (moved <= 0) {
                              throw new IOException("Unexpected end of file while copying " + file);
                          }
                          position += moved;
                          remaining -= moved;
                      }
                  }
                  result.add(outputFile);
              }
          }
          return result;
      }

      /**
       * Computes where to cut the file. Starting from the previous cut it jumps
       * {@code length / piece} bytes ahead and then advances to the next '\n';
       * that newline becomes the last byte of the current piece. The remainder
       * of the file forms the final piece. An empty file yields a single
       * zero-length piece.
       *
       * @param file  file to analyse
       * @param piece desired number of pieces (must be positive)
       * @return one {@link Point} per piece, in file order
       * @throws IOException              if the file cannot be read
       * @throws IllegalArgumentException if {@code piece} is not positive
       */
      private List<Point> blocking(File file, int piece) throws IOException {
          if (piece <= 0) {
              throw new IllegalArgumentException("piece must be positive: " + piece);
          }
          List<Point> result = new ArrayList<Point>();
          // Offsets of the last byte of each piece; -1 is a sentinel meaning
          // "the first piece starts at offset 0".
          List<Long> list = new ArrayList<Long>();
          list.add(-1L);
          long length = file.length();
          long step = length / piece;
          // Offset of the next unread byte; always equal to the stream position.
          long index = 0;
          try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) {
              for (int i = 0; i < piece && index + step < length; i++) {
                  skipFully(in, step);
                  index += step;
                  int b = in.read();
                  while (b != -1 && b != '\n') {
                      index++;
                      b = in.read();
                  }
                  if (b == -1) {
                      // No newline before EOF: the rest belongs to the final piece.
                      break;
                  }
                  list.add(index);
                  index++;
              }
          }
          long lastByte = length - 1;
          // Skip the closing cut when the last newline found already is the final
          // byte; otherwise an empty trailing piece would be produced.
          if (list.size() == 1 || list.get(list.size() - 1).longValue() != lastByte) {
              list.add(lastByte);
          }
          System.out.println(list);
          for (int i = 0; i < list.size() - 1; i++) {
              long skipSize = list.get(i) + 1;
              long l = list.get(i + 1) - list.get(i);
              result.add(new Point(skipSize, l));
          }
          System.out.println(result);
          return result;
      }

      /**
       * Skips exactly {@code count} bytes. A single {@code skip} call may skip
       * fewer bytes than requested, so it is repeated, falling back to a
       * one-byte read to tell "no progress" apart from end of file.
       */
      private static void skipFully(BufferedInputStream in, long count) throws IOException {
          long remaining = count;
          while (remaining > 0) {
              long skipped = in.skip(remaining);
              if (skipped > 0) {
                  remaining -= skipped;
              } else if (in.read() == -1) {
                  throw new IOException("Unexpected end of file while skipping");
              } else {
                  remaining--;
              }
          }
      }

      /** Copies exactly {@code count} bytes from {@code in} to {@code out}. */
      private static void copy(BufferedInputStream in, BufferedOutputStream out, long count) throws IOException {
          byte[] buffer = new byte[COPY_BUFFER_SIZE];
          long remaining = count;
          while (remaining > 0) {
              int read = in.read(buffer, 0, (int) Math.min(buffer.length, remaining));
              if (read < 0) {
                  throw new IOException("Unexpected end of file while copying");
              }
              out.write(buffer, 0, read);
              remaining -= read;
          }
      }

      /**
       * A cut of the source file: {@code skipSize} is the number of bytes to skip
       * from the start of the file and {@code length} the number of bytes that
       * belong to the piece.
       */
      private static final class Point {
          private final long skipSize;
          private final int length;

          /**
           * @throws IllegalArgumentException if {@code length} does not fit in an
           *                                  {@code int}
           */
          public Point(long skipSize, long length) {
              if (length > Integer.MAX_VALUE) {
                  throw new IllegalArgumentException("長度溢位");
              }
              this.skipSize = skipSize;
              this.length = (int) length;
          }

          @Override
          public String toString() {
              return "Point [skipSize=" + skipSize + ", length=" + length + "]\n";
          }

          public long getSkipSize() {
              return skipSize;
          }

          public int getLength() {
              return length;
          }
      }
  }


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/29254281/viewspace-1161173/,如需轉載,請註明出處,否則將追究法律責任。

相關文章