目的:解析xml檔案,並存入mysql,並且要解析的欄位能一一對應.這裡解析的是微博的檔案,想要利用裡面的article和person_id欄位.
思路:
為了讓person_id和article能一一對應,對兩個欄位分別解析,並且定義一個私有變數ct,在覆寫(override)的函式startElement中每遇到一個元素的開始標籤就自動加1.這個ct作為插入mysql中的article和person_id的主鍵,即(ct,article)和(ct,person_id);在分別插入兩張不同的表a和b之後,對兩個表做連線(join)操作,實現article和person_id的一一對應(曲線救國啊!!!)
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.Deque;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that turns the {@code <article>} elements of an XML corpus into a MySQL
 * import script.
 *
 * <p>Every element start increments a running counter {@code ct}. For each
 * {@code <article>} whose text is not blank, one line
 * {@code insert into tb_xml_article_hd1 values (<ct>,"<text>");} followed by
 * {@code commit;} is appended to {@code addfile.txt} in the output directory. The counter
 * value used is the one reached when the {@code <article>} start tag was seen.
 *
 * <p>An empty {@code add_id.txt} is also created in the output directory; nothing is
 * written to it by this class.
 *
 * <p>Instances keep per-parse state and are not safe for concurrent use.
 */
public class sax_parse_xml extends DefaultHandler {

    /** Directory used for the generated scripts when none is given. */
    private static final String DEFAULT_OUTPUT_DIR = "/home/hadoop/weibo_content_corpus";

    /** Corpus file parsed by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_INPUT =
            "/home/hadoop/weibo_content_corpus/nlpir_weibo_content";

    /** Name of the element whose text is exported. */
    private static final String ARTICLE_TAG = "article";

    /** Names of the currently open elements, innermost first. */
    private final Deque<String> tags = new ArrayDeque<>();

    /** Text collected so far for the {@code <article>} element being read. */
    private final StringBuilder articleText = new StringBuilder();

    /** Script file that receives the article insert statements. */
    private final File articleScript;

    /** Companion file that is only created, never written to, by this class. */
    private final File personScript;

    /** Number of element start tags seen so far; used as the row key. */
    private long ct = 0;

    /** Value of {@link #ct} when the current {@code <article>} element started. */
    private long articleId = 0;

    /**
     * Tells whether a code point is allowed in an XML 1.0 document.
     *
     * @param ch the code point to test
     * @return {@code true} for tab, line feed, carriage return, and the ranges
     *     0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF; {@code false} otherwise
     */
    public static boolean isLegalXMLCharacter(int ch) {
        if (ch == '\t' || ch == '\n' || ch == '\r') {
            return true;
        }
        return (ch >= 0x20 && ch <= 0xD7FF)
                || (ch >= 0xE000 && ch <= 0xFFFD)
                || (ch >= 0x10000 && ch <= 0x10FFFF);
    }

    /** Creates a handler that writes its scripts to the default output directory. */
    public sax_parse_xml() {
        this(DEFAULT_OUTPUT_DIR);
    }

    /**
     * Creates a handler that writes its scripts to the given directory.
     *
     * @param outputDir directory in which {@code addfile.txt} and {@code add_id.txt}
     *     are created
     */
    public sax_parse_xml(String outputDir) {
        super();
        this.articleScript = new File(outputDir, "addfile.txt");
        this.personScript = new File(outputDir, "add_id.txt");
    }

    /**
     * Parses the corpus and reports the elapsed time.
     *
     * @param args optional: {@code args[0]} is the XML input (defaults to the built-in
     *     corpus path), {@code args[1]} is the output directory (defaults to the built-in
     *     directory)
     */
    public static void main(String args[]) {
        long lasting = System.currentTimeMillis();
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputDir = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_DIR;
        try {
            SAXParserFactory sf = SAXParserFactory.newInstance();
            SAXParser sp = sf.newSAXParser();
            sax_parse_xml reader = new sax_parse_xml(outputDir);
            sp.parse(new InputSource(input), reader);
        } catch (Exception e) {
            // Top-level boundary: report the failure and still print the timing below.
            e.printStackTrace();
        }
        // Leftover diagnostic output kept for output compatibility; the literal is written
        // as a Unicode escape so compilation does not depend on the source encoding.
        System.out.println((int) '\u904B');
        System.out.println("執行時間:" + (System.currentTimeMillis() - lasting)
                + "毫秒");
    }

    /**
     * Resets the per-document state and makes sure both output files exist.
     *
     * <p>The element counter is deliberately not reset, so keys stay unique when one
     * handler instance is used for several documents.
     */
    @Override
    public void startDocument() throws SAXException {
        tags.clear();
        articleText.setLength(0);
        ensureExists(articleScript);
        ensureExists(personScript);
    }

    /**
     * Collects character data that belongs directly to an {@code <article>} element.
     *
     * <p>A parser may deliver the text of one element in several calls, so the data is
     * only buffered here and written out in {@link #endElement}.
     */
    @Override
    public void characters(char ch[], int start, int length)
            throws SAXException {
        // peek() yields null when no element is open, which simply fails the comparison.
        if (ARTICLE_TAG.equals(tags.peek())) {
            articleText.append(ch, start, length);
        }
    }

    /** Records the opened element and advances the row-key counter. */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attrs) {
        // ArrayDeque rejects null elements; fall back to an empty name.
        tags.push(qName == null ? "" : qName);
        ct = ct + 1;
        if (ARTICLE_TAG.equals(qName)) {
            articleText.setLength(0);
            articleId = ct;
        }
    }

    /**
     * Closes the current element and, for a non-blank {@code <article>}, appends its
     * insert statement to the script.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (!tags.isEmpty()) {
            tags.pop();
        }
        if (ARTICLE_TAG.equals(qName)) {
            String text = articleText.toString();
            articleText.setLength(0);
            if (text.trim().length() > 0) {
                writeArticleInsert(articleId, text);
            }
        }
    }

    /** Creates the file if it is missing; a failure is reported but does not stop parsing. */
    private static void ensureExists(File file) {
        try {
            // Returns false when the file already exists, which is fine here.
            file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Escapes backslashes and double quotes for use inside a double-quoted SQL string. */
    private static String escapeForDoubleQuotedSql(String text) {
        return text.replace("\\", "\\\\").replace("\"", "\\\"");
    }

    /** Appends one insert/commit pair to the article script and echoes it to stdout. */
    private void writeArticleInsert(long id, String text) {
        String statement = "insert into tb_xml_article_hd1 values (" + id + ","
                + "\"" + escapeForDoubleQuotedSql(text) + "\"" + ");";
        System.out.println("article:");
        // Append mode keeps earlier statements; UTF-8 is fixed so the script does not
        // depend on the platform default charset.
        try (Writer writer = new OutputStreamWriter(
                new FileOutputStream(articleScript, true), StandardCharsets.UTF_8)) {
            writer.write(statement + "\n" + "commit;" + "\n");
        } catch (IOException e) {
            // Best effort, as before: report and continue with the next article.
            e.printStackTrace();
        }
        System.out.println(statement);
    }
}