自娛小程式–超大檔案topN

毒狼發表於2016-09-28
 設計思路:
new 一個執行緒池
開啟檔案,Nio或Reader。NIO方式:開啟檔案後Map一大塊MappedByteBuffer,從Buffer中讀出一定大小的資料,定位到最後一個`\n`,最後一個`\n`及之前的資料put到一個執行緒執行類例項的Buffer中,餘下的put到一個臨時Buffer裡,下一次迴圈時處理這部分內容;在執行緒的執行體中,先行rewind bytebuffer,迴圈處理buffer,讀到一個完整的import語句就put到map裡,buffer處理完成後合併map到全域性concurrentmap中。BIO方式:讀一定的行數後submit執行緒到執行緒池,之後用正規表示式處理每一行生成map,處理完成後合併map。
上程式碼:
=========================NIO================================= 

package com.arvey.files;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.nio.ByteBuffer;

import java.nio.MappedByteBuffer;

import java.nio.channels.FileChannel;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Comparator;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.concurrent.ConcurrentHashMap;

import java.util.concurrent.ExecutorService;

import java.util.concurrent.Executors;

/**
 * Counts how often each {@code import} statement occurs in a very large source
 * file and prints the most frequent ones.
 *
 * <p>The file is memory-mapped region by region. Each region is cut into
 * chunks of at most {@link #handleSize} bytes that end on a line boundary, and
 * every chunk is counted by a worker task. Per-chunk counts are merged into the
 * shared {@link #result} map.
 */
public class StateCounterNIO {

    /** Size in bytes of each memory-mapped region of the input file. */
    public static final int mappedSize = 5*4*1024;

    /** Capacity in bytes of the buffer handed to a single worker task. */
    public static final int handleSize = 4*1024;

    /** Number of threads in the worker pool. */
    public final static int ExecutorsNum = 200;

    /** Number of entries printed by {@link #main(String[])}. */
    private static final int TOP_N = 100;

    /** Byte sequence that marks the beginning of an import statement. */
    private static final byte[] IMPORT_PREFIX = "import ".getBytes();

    /** Path of the file to analyse. */
    String file = "/Users/arvey/wwork/Docs.code.clean/docsapp/bigfile.src";

    /** Global import statement -> occurrence count, shared by all workers. */
    public static ConcurrentHashMap<String,Integer> result = new ConcurrentHashMap<String,Integer>();

    private ExecutorService pool = Executors.newFixedThreadPool(ExecutorsNum);

    /**
     * Adds the counts of {@code partial} to the global {@link #result} map.
     *
     * <p>The method is synchronized so that the read-modify-write of each
     * counter is atomic with respect to other workers.
     *
     * @param partial counts gathered by one worker
     */
    public static synchronized void updateTopList(Map<String,Integer> partial) {
        for (Map.Entry<String,Integer> entry : partial.entrySet()) {
            Integer current = result.get(entry.getKey());
            int sum = (current == null) ? entry.getValue() : current + entry.getValue();
            result.put(entry.getKey(), sum);
        }
    }

    /**
     * Reads {@link #file}, distributes it to the worker pool in line-aligned
     * chunks and blocks until every chunk has been counted.
     *
     * <p>I/O errors are reported on {@code System.err}; chunks submitted before
     * the error are still counted.
     */
    public void getTop10() {
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(file));
            FileChannel fc = fis.getChannel();
            long fileLength = fc.size();
            long foffset = 0L;

            // Bytes after the last newline of the previous chunk, i.e. the
            // beginning of a line whose end has not been read yet.
            byte[] carry = new byte[handleSize];
            int carryLength = 0;

            while (foffset < fileLength) {
                long buffersize = Math.min(fileLength - foffset, (long) mappedSize);
                MappedByteBuffer buffer = fc.map(FileChannel.MapMode.READ_ONLY, foffset, buffersize);
                foffset += buffersize;
                boolean lastRegion = (foffset == fileLength);

                while (buffer.hasRemaining()) {
                    HandleBuffer aHandle = new HandleBuffer();
                    ByteBuffer chunk = aHandle.getMbuffer();

                    // Start the chunk with the unfinished line of the previous one.
                    chunk.put(carry, 0, carryLength);
                    carryLength = 0;

                    int count = Math.min(buffer.remaining(), chunk.remaining());
                    buffer.get(chunk.array(), chunk.position(), count);
                    chunk.position(chunk.position() + count);

                    boolean endOfFile = lastRegion && !buffer.hasRemaining();
                    if (!endOfFile) {
                        int filled = chunk.position();
                        // Number of leading bytes that form complete lines
                        // (the terminating '\n' stays inside the chunk).
                        int cut = lastIndexOf(chunk.array(), filled, (byte) '\n') + 1;
                        if (cut == 0 && !chunk.hasRemaining()) {
                            // A single line longer than the chunk: it cannot be
                            // kept together, so it is flushed as it is.
                            cut = filled;
                        }
                        carryLength = filled - cut;
                        System.arraycopy(chunk.array(), cut, carry, 0, carryLength);
                        chunk.position(cut);
                    }

                    // An empty chunk means everything was carried over.
                    if (chunk.position() > 0) {
                        pool.submit(aHandle);
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to process " + file + ": " + e);
        } finally {
            if (fis != null) {
                try {
                    fis.close(); // also closes the channel obtained from it
                } catch (IOException e) {
                    System.err.println("Failed to close " + file + ": " + e);
                }
            }
        }
        awaitWorkers();
    }

    /** Shuts the pool down and waits until all submitted chunks are counted. */
    private void awaitWorkers() {
        pool.shutdown();
        try {
            while (!pool.awaitTermination(2, java.util.concurrent.TimeUnit.SECONDS)) {
                // keep waiting; the workers are still counting
            }
        } catch (InterruptedException e) {
            pool.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /** Returns the last index below {@code length} holding {@code target}, or -1. */
    private static int lastIndexOf(byte[] data, int length, byte target) {
        for (int i = length - 1; i >= 0; i--) {
            if (data[i] == target) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Worker task that counts the import statements of one chunk.
     *
     * <p>The producer fills {@link #getMbuffer()}; the buffer position marks
     * the end of the data to be scanned.
     */
    class HandleBuffer implements Runnable {

        ByteBuffer mbuffer = ByteBuffer.allocate(handleSize);

        /** Returns the buffer the producer writes the chunk into. */
        public ByteBuffer getMbuffer() {
            return mbuffer;
        }

        /**
         * Scans the chunk for {@code "import "} and counts the text from there
         * to the end of the line (trimmed), then merges the counts into the
         * global result.
         */
        @Override
        public void run() {
            Map<String,Integer> counts = new HashMap<String,Integer>();
            byte[] data = mbuffer.array();
            int end = mbuffer.position();
            int matched = 0;       // bytes of IMPORT_PREFIX matched so far
            int importStart = -1;  // start of the current import, -1 if none

            for (int i = 0; i < end; i++) {
                byte current = data[i];
                if (importStart >= 0) {
                    if (current == '\n') {
                        count(counts, data, importStart, i);
                        importStart = -1;
                        matched = 0;
                    }
                } else if (current == IMPORT_PREFIX[matched]) {
                    matched++;
                    if (matched == IMPORT_PREFIX.length) {
                        importStart = i + 1 - IMPORT_PREFIX.length;
                    }
                } else {
                    // The prefix has no repeated characters, so after a
                    // mismatch only its first byte can start a new match.
                    matched = (current == IMPORT_PREFIX[0]) ? 1 : 0;
                }
            }
            // Last line of the file without a trailing newline.
            if (importStart >= 0) {
                count(counts, data, importStart, end);
            }
            StateCounterNIO.updateTopList(counts);
        }

        /** Increments the counter of the statement stored in {@code data[from, to)}. */
        private void count(Map<String,Integer> counts, byte[] data, int from, int to) {
            String statement = new String(data, from, to - from).trim();
            Integer seen = counts.get(statement);
            counts.put(statement, seen == null ? 1 : seen + 1);
        }
    }

    /**
     * Counts the imports of the configured file and prints the most frequent
     * ones followed by the elapsed time in milliseconds.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        long startat = System.currentTimeMillis();
        StateCounterNIO aNIO = new StateCounterNIO();
        aNIO.getTop10();

        List<Map.Entry<String,Integer>> slist = new ArrayList<Map.Entry<String,Integer>>(result.entrySet());
        Collections.sort(slist, new Comparator<Map.Entry<String,Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                // Descending by count; returns 0 for ties as the contract requires.
                return o2.getValue().compareTo(o1.getValue());
            }
        });

        int index = 0;
        for (Map.Entry<String,Integer> aEntry : slist) {
            if (index++ >= TOP_N) {
                break;
            }
            System.out.println(aEntry.getKey() + "--" + aEntry.getValue());
        }
        System.out.println("The cost is " + (System.currentTimeMillis()-startat) );
    }
}

==================================BIO======================================

package com.arvey.files;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Counts how often each {@code import} statement occurs in a very large source
 * file using blocking, line-oriented I/O.
 *
 * <p>The file is read in batches of lines; every batch is matched against
 * {@link #pattern} by a worker task and the per-batch counts are merged into
 * the shared {@link #result} map.
 */
public class StateCountBIO {

    /** Number of threads in the worker pool. */
    public final static int ExecutorsNum = 200;

    /** Number of lines handed to a single worker task. */
    private static final int LINES_PER_TASK = 1024;

    /** Number of entries printed by {@link #main(String[])}. */
    private static final int TOP_N = 100;

    ExecutorService pool = Executors.newFixedThreadPool(ExecutorsNum);

    /** Path of the file to analyse. */
    String file = "/Users/arvey/wwork/Docs.code.clean/docsapp/bigfile.src";

    /** Global import statement -> occurrence count, shared by all workers. */
    public static ConcurrentHashMap<String,Integer> result = new ConcurrentHashMap<String,Integer>();

    /** Matches a line that starts with {@code import} and contains a {@code ;}. */
    public static Pattern pattern = Pattern.compile("^(import.+);");

    private BufferedReader freader;

    private int poolcounter = 0;

    /** Returns the number of worker tasks submitted so far. */
    public int getPoolcounter() {
        return poolcounter;
    }

    /**
     * Adds the counts of {@code partial} to the global {@link #result} map.
     *
     * <p>The method is synchronized so that the read-modify-write of each
     * counter is atomic with respect to other workers.
     *
     * @param partial counts gathered by one worker
     */
    public static synchronized void updateTopList(Map<String,Integer> partial) {
        for (Map.Entry<String,Integer> entry : partial.entrySet()) {
            Integer current = result.get(entry.getKey());
            int sum = (current == null) ? entry.getValue() : current + entry.getValue();
            result.put(entry.getKey(), sum);
        }
    }

    /**
     * Reads {@link #file} in batches of lines, submits every non-empty batch
     * to the worker pool and blocks until all batches have been counted.
     *
     * <p>I/O errors (including a missing file) are reported on
     * {@code System.err}; batches submitted before the error are still counted.
     */
    public void getTop10() {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(file))));
            freader = reader;
            boolean notreachedend = true;
            while (notreachedend) {
                HandleRun anInst = new HandleRun();
                StringBuilder batch = anInst.getBuffer();
                int lines = 0;
                while (lines < LINES_PER_TASK) {
                    String content = reader.readLine();
                    if (content == null) {
                        notreachedend = false;
                        break;
                    }
                    batch.append(content).append('\n');
                    lines++;
                }
                // Skip the empty batch produced when the file ends exactly on
                // a batch boundary.
                if (lines > 0) {
                    poolcounter++;
                    pool.submit(anInst);
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to process " + file + ": " + e);
        } finally {
            // The reader is null when the file could not be opened.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    System.err.println("Failed to close " + file + ": " + e);
                }
            }
        }

        pool.shutdown();
        try {
            while (!pool.awaitTermination(2, java.util.concurrent.TimeUnit.SECONDS)) {
                // keep waiting; the workers are still counting
            }
        } catch (InterruptedException e) {
            pool.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /** Worker task that counts the import statements of one batch of lines. */
    class HandleRun implements Runnable {

        StringBuilder buffer = new StringBuilder();

        /** Returns the buffer the producer appends newline-terminated lines to. */
        public StringBuilder getBuffer() {
            return buffer;
        }

        /**
         * Matches every line of the batch against {@link StateCountBIO#pattern}
         * and merges the counts of the matched text into the global result.
         */
        @Override
        public void run() {
            Map<String,Integer> res = new HashMap<String,Integer>();
            for (String aLine : buffer.toString().split("\n")) {
                Matcher m = pattern.matcher(aLine);
                if (m.find()) {
                    String key = m.group(0);
                    Integer seen = res.get(key);
                    res.put(key, seen == null ? 1 : seen + 1);
                }
            }
            StateCountBIO.updateTopList(res);
        }
    }

    /**
     * Orders keys of {@link #result} by descending count; keys with equal
     * counts are ordered alphabetically so the ordering is consistent.
     * A key that is absent from the map is treated as having count 0.
     */
    static class ValueComparator implements Comparator<String> {

        Map<String,Integer> map = result;

        public ValueComparator() {
        }

        @Override
        public int compare(String o1, String o2) {
            Integer first = map.get(o1);
            Integer second = map.get(o2);
            int firstCount = (first == null) ? 0 : first;
            int secondCount = (second == null) ? 0 : second;
            if (firstCount != secondCount) {
                return firstCount > secondCount ? -1 : 1;
            }
            return o1.compareTo(o2);
        }
    }

    /**
     * Counts the imports of the configured file and prints the most frequent
     * ones followed by the number of submitted tasks and the elapsed time in
     * milliseconds.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        long startat = System.currentTimeMillis();
        StateCountBIO aCount = new StateCountBIO();
        aCount.getTop10();

        List<Map.Entry<String,Integer>> slist = new ArrayList<Map.Entry<String,Integer>>(result.entrySet());
        Collections.sort(slist, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                // Descending by count; returns 0 for ties as the contract requires.
                return o2.getValue().compareTo(o1.getValue());
            }
        });

        int index = 0;
        for (Map.Entry<String,Integer> aEntry : slist) {
            if (index++ >= TOP_N) {
                break;
            }
            System.out.println(aEntry.getKey() + "--" + aEntry.getValue());
        }
        System.out.println("The Thread counter is " + aCount.getPoolcounter());
        System.out.println("The cost is " + (System.currentTimeMillis()-startat) );
    }
}

=====================================================
效率分析
處理大檔案,檔案size達到8967006720位元組時,執行緒池大小為200、100、50的耗時對比如下(五次執行的平均結果,單位:毫秒):
                       NIO                BIO
200                139843          67376 
100                136914          66576
50                  140000          67249  
為何NIO的要慢於BIO的呢?
NIO在處理執行緒中遍歷buffer,是不是這個原因造成的呢?當增加每次buffer處理的容量時,效能提升明顯,如檔案每次map的空間和每一個執行緒處理的buffer空間都擴容10倍時,在50個執行緒時,耗時降到82439;但是對於BIO,調整一次處理的行數,效能變化很小,程式執行時間略有增長(726**)。那麼怎樣才能獲得最好的效能呢!

相關文章