java使用sax解析xml

wangq17發表於2016-12-05

目的:解析xml檔案,並存入mysql,並且要解析的欄位能一一對應.這裡解析的是微博的檔案,想要利用裡面的article和person_id欄位.

思路:

為了讓person_id和article能一一對應,對這兩個欄位分別解析,並定義一個私有變數ct,在覆寫(override)的函式startElement中每遇到一個元素就自動加1.這個ct作為插入mysql的article和person_id的主鍵,即(ct,article)和(ct,person_id).分別插入兩張不同的表a和b之後,再對兩個表做連線(join)操作,實現article和person_id的一一對應(曲線救國啊!!!)

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;

/**
 * SAX handler that converts the text of every {@code <article>} element into a
 * MySQL {@code insert} statement and appends it to {@code addfile.txt}.
 *
 * <p>Each statement is keyed by a running element counter ({@code ct}) that is
 * incremented once per {@code startElement} callback, so rows produced from
 * different fields of the same record can later be joined on that counter.
 *
 * <p>Only {@code article} output is active. The original code also contained a
 * disabled path that wrote {@code person_id} rows to {@code add_id.txt} keyed by
 * {@code ct-4}; that path stays disabled, but {@code add_id.txt} is still
 * created (empty) as before.
 *
 * <p>Not thread-safe: one handler instance must be used by one parse at a time.
 */
public class sax_parse_xml extends DefaultHandler {

    /** Directory the scripts are written to when none is given. */
    private static final String DEFAULT_OUTPUT_DIR = "/home/hadoop/weibo_content_corpus";

    /** XML document parsed by {@link #main} when no argument is given. */
    private static final String DEFAULT_INPUT =
            "/home/hadoop/weibo_content_corpus/nlpir_weibo_content";

    /** qName of the element whose text is exported. */
    private static final String ARTICLE_TAG = "article";

    /** qNames of the currently open elements, innermost on top. */
    java.util.Stack<String> tags = new java.util.Stack<String>();

    /** Number of elements started so far; used as the generated primary key. */
    private long ct = 0;

    /** Script receiving the article insert statements. */
    private final File articleScript;

    /** Script reserved for person_id statements (created, currently never written). */
    private final File personScript;

    /** Text collected for the article element being read, or null outside one. */
    private StringBuilder articleText;

    /** Value of {@code ct} when the article element being read was started. */
    private long articleId;

    /**
     * Tells whether a code point is allowed by the XML 1.0 {@code Char} production.
     *
     * @param ch Unicode code point to test
     * @return true for tab, LF, CR, 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF
     */
    public static boolean isLegalXMLCharacter(int ch) {
        if (ch == '\t' || ch == '\n' || ch == '\r') {
            return true;
        }
        return (ch >= 0x20 && ch <= 0xD7FF)
                || (ch >= 0xE000 && ch <= 0xFFFD)
                || (ch >= 0x10000 && ch <= 0x10FFFF);
    }

    /** Creates a handler that writes into the default output directory. */
    public sax_parse_xml() {
        this(DEFAULT_OUTPUT_DIR);
    }

    /**
     * Creates a handler that writes its scripts into the given directory.
     *
     * @param outputDir directory that receives {@code addfile.txt} and {@code add_id.txt}
     */
    public sax_parse_xml(String outputDir) {
        super();
        this.articleScript = new File(outputDir, "addfile.txt");
        this.personScript = new File(outputDir, "add_id.txt");
    }

    /**
     * Parses the corpus and reports the elapsed time.
     *
     * @param args optional: {@code args[0]} input document (system id or path),
     *             {@code args[1]} output directory; the original hard-coded
     *             locations are used for whatever is omitted
     */
    public static void main(String args[]) {
        long lasting = System.currentTimeMillis();
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputDir = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_DIR;
        try {
            // NOTE(review): if the input can come from an untrusted party, consider
            // disabling DOCTYPE/external entities on the factory - confirm first
            // that the corpus carries no DOCTYPE declaration.
            SAXParserFactory sf = SAXParserFactory.newInstance();
            SAXParser sp = sf.newSAXParser();
            sax_parse_xml reader = new sax_parse_xml(outputDir);
            sp.parse(new InputSource(input), reader);
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Diagnostic output kept from the original program.
        System.out.println((int)'運');
        System.out.println("執行時間：" + (System.currentTimeMillis() - lasting)
                + "毫秒");
    }

    /** Makes sure both script files exist before any element is processed. */
    @Override
    public void startDocument() {
        ensureExists(articleScript);
        ensureExists(personScript);
    }

    /**
     * Records the opened element and advances the key counter; starts collecting
     * text when the element is an article.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attrs) {
        tags.push(qName);
        ct = ct + 1;
        if (ARTICLE_TAG.equals(qName)) {
            articleText = new StringBuilder();
            articleId = ct;
        }
    }

    /**
     * Collects character data that belongs directly to the open article element.
     *
     * <p>A parser may deliver one run of text in several calls, so the text is
     * only buffered here and written once the element is closed.
     */
    @Override
    public void characters(char ch[], int start, int length)
            throws SAXException {
        if (articleText == null || tags.isEmpty()) {
            return;
        }
        if (ARTICLE_TAG.equals(tags.peek())) {
            articleText.append(ch, start, length);
        }
    }

    /**
     * Closes the innermost element; when it is an article with non-blank text,
     * appends exactly one insert statement for it.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (!tags.isEmpty()) {
            tags.pop();
        }
        if (!ARTICLE_TAG.equals(qName) || articleText == null) {
            return;
        }
        String text = articleText.toString();
        articleText = null;

        System.out.println("article:");
        if (text.trim().length() > 0) {
            String statement = buildArticleInsert(articleId, text);
            appendStatement(articleScript, statement);
            System.out.println(statement);
        }
    }

    /**
     * Builds the insert statement for one article row.
     *
     * @param id   primary key value
     * @param text raw article text; backslashes and double quotes are escaped
     * @return the statement, terminated by {@code ;}
     */
    static String buildArticleInsert(long id, String text) {
        return "insert into tb_xml_article_hd1 values (" + id + ",\""
                + escapeForMysql(text) + "\");";
    }

    /**
     * Escapes text for use inside a double-quoted MySQL string literal.
     * Assumes the server's default mode, in which backslash is an escape character.
     */
    private static String escapeForMysql(String text) {
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == '\\' || c == '"') {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }

    /**
     * Appends a statement followed by {@code commit;} to the script as UTF-8.
     * Failures are reported and otherwise ignored so one bad write does not
     * abort the whole parse (same best-effort policy as the original).
     */
    private static void appendStatement(File script, String statement) {
        try (java.io.Writer writer = new java.io.OutputStreamWriter(
                new FileOutputStream(script, true),
                java.nio.charset.StandardCharsets.UTF_8)) {
            writer.write(statement + "\n" + "commit;" + "\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Creates the file if it is missing; a failure is reported, not thrown. */
    private static void ensureExists(File file) {
        if (file.exists()) {
            return;
        }
        try {
            file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

XML解析-最快的方式SAX
2018-05-02
XML
使用 Java 解析XML檔案
2022-04-07
JavaXML
sax解析例子演示
2018-06-05
Java解析XML
2018-07-10
JavaXML
jaxp的sax解析操作
2018-09-10
dom解析和sax解析的區別
2020-11-05
java使用jaxb解析XML（含根據xml自動生成實體類）
2018-08-21
JavaXML
使用DocumentBuilderFactory解析XML淺談
2023-11-14
UIXML
Xml解析
2020-11-27
XML
go 解析xml
2019-08-18
GoXML
iOS – XML解析
2019-03-04
iOSXML
java的XML解析(DOM4J技術)
2019-01-10
JavaXML
如何使用 ABAP 程式碼解析 XML 檔案
2022-09-04
XML
java EE開發之Servlet第五課：xml解析
2022-11-03
JavaServletXML
Java設計模式模式（包括工廠模式xml解析）
2018-07-09
Java設計模式XML
請利用SAX編寫程式解析Yahoo的XML格式的天氣預報，獲取天氣預報——python學習筆記
2019-01-05
XMLPython筆記
Java 解析xml報文放入Map，並判斷所有xml標籤是否為空
2018-11-06
JavaXML
python之XML解析
2019-09-23
PythonXML
175.XML解析
2020-10-18
XML
C# XML解析
2018-03-30
C#XML
使用JAXP對xml文件進行DOM解析基礎
2021-09-09
XML
python XML 檔案解析
2024-04-24
PythonXML
jdom解析xml檔案
2020-11-09
XML
Python XML解析之DOM
2018-09-29
PythonXML
XML 檔案解析實踐 (DOM 解析)
2020-10-02
XML
使用XML和Java混合控制UI頁面
2020-10-25
XMLJavaUI
XML DOM 解析器概述
2022-02-26
XML
搞懂 XML 解析，徒手造 WEB 框架
2020-04-20
XMLWeb框架
【spring原始碼系列】之【xml解析】
2021-04-19
Spring原始碼XML
Python解析XML檔案生成HTML
2019-02-16
PythonXMLHTML
使用pegjs解析java程式碼
2020-10-20
JSJava
NPM酷庫051：xml2js，解析XML格式資料
2019-02-16
NPMXMLJS
《手寫Mybatis》第4章：Mapper XML的解析和註冊使用
2022-04-11
MyBatisAPPXML
使用jaxp解析器dom方式對xml節點進行操作
2018-03-25
XML
使用FOR XML AUTO控制XML輸出KH
2022-03-21
XML
java基礎之XML
2018-12-11
JavaXML
Java-進階篇【Junit單元測試、反射、註解、動態代理、XML、XML解析、XPath、設計模式】---10
2024-05-25
Java反射XML設計模式
XML DOM 解析器錯誤概述
2022-06-05
XML
【Go學習筆記16】解析xml
2018-03-15
Go筆記XML

java使用sax解析xml

相關文章