一個類似於 Native2Ascii 的東東

iceant發表於2003-09-03

同事晚上問到這個需求，我花了點時間趕製出來，不確定是否還有問題。請大家在不同的環境下測試一下，有問題請回帖，謝謝。

package com.utstar.pizer.util.unicode;

import java.util.*;
import java.io.*;

/**
 * Converts between text containing characters outside ISO-8859-1 and an
 * ASCII-friendly form in which those characters are written as a backslash,
 * the letter 'u' and four hexadecimal digits (similar in spirit to the JDK
 * native2ascii tool).
 *
 * <p>Copyright: Copyright (c) 2003</p>
 * @author not attributable
 * @version 1.0
 */

public class UnicodeUtil {

  /** Marker that introduces an escape: one backslash followed by 'u'. */
  private static final String ESCAPE_PREFIX = "\\u";

  /** Number of hexadecimal digits that must follow the marker. */
  private static final int HEX_DIGITS = 4;

  /**
   * Small demo: escapes a sample string, writes it to "data.txt" as UTF-8,
   * reads it back, decodes it again and prints the elapsed time.
   *
   * @param args ignored
   * @throws Exception if the demo file cannot be written or read
   */
  public static void main(String[] args) throws Exception {
    // The non-ASCII sample characters (U+4F60, U+6211, U+540C, U+6642) are
    // spelled as escapes so this file compiles under any source encoding.
    System.out.println(Integer.toHexString( (int) '\u4F60'));
    System.out.println(Integer.toHexString( (int) '\u6211'));

    String tmp = "\\u" + "4F60" + "\\u" + "6211\u0000\u540C\\u\u6642a\\bc\u5e87DEf_)*&^\\u^";
    System.out.println(tmp);
    String tmp2 = escapeUnicode(tmp);
    System.out.println("[" + tmp2 + "]");

    OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("data.txt"), "UTF-8");
    try {
      osw.write(tmp2);
      osw.flush();
    }
    finally {
      // Close even when the write fails so the file handle is not leaked.
      osw.close();
    }

    long start = System.currentTimeMillis();
    StringBuffer s = new StringBuffer();
    InputStreamReader isr = new InputStreamReader(new FileInputStream("data.txt"), "UTF-8");
    try {
      char[] inc = new char[1024 * 4];
      int p;
      // read() signals end of stream with -1.
      while ( (p = isr.read(inc, 0, inc.length)) != -1) {
        s.append(inc, 0, p);
      }
    }
    finally {
      isr.close();
    }
    System.out.println(unescapeUnicode(s.toString(), null));
    long end = System.currentTimeMillis();
    System.out.println("Time consumed:" + (end - start));
  }

  /*
     Decoding rules implemented by decodeEscapes():
     1. Scan for the next marker (a backslash followed by 'u').
     2. If the character just before the marker is also a backslash, the
        marker is a literal: the second backslash is dropped, 'u' is kept,
        and scanning resumes after the 'u'.
     3. If fewer than four characters follow the marker, the rest of the
        input is copied unchanged and decoding stops.
     4. If the four characters after the marker are all ASCII hex digits,
        they are replaced (together with the marker) by the character with
        that code; otherwise the marker is copied unchanged and scanning
        resumes right after it.
   */

  /**
   * Decode 4-hex-digit unicode escapes from a String.
   * Escapes are defined in
   * <a href=" http://java.sun.com/docs/books/jls/second_edition/html/lexical.doc.html#100850 ">&sect;3.3</a>
   * of the
   * <a href=" http://java.sun.com/docs/books/jls/second_edition/html/j.title.doc.html ">java language specification</a>.
   *
   * As a short example: "&#92;u00BF" would be translated into &iquest; (the inverted question mark)
   *
   * <p>Malformed or truncated escapes are never an error; they are copied to
   * the result unchanged. Only the ASCII characters 0-9, a-f and A-F count as
   * hex digits, so sign characters are not accepted.</p>
   *
   * @param s the string to decode; may be null
   * @param encoding optional charset name. When null or empty the decoded
   *        text is returned as is. Otherwise the decoded text is passed
   *        through that charset (encoded and decoded with the same charset),
   *        so characters the charset cannot represent are replaced by the
   *        charset's substitution character.
   * @return the decoded string, or null if s is null
   * @throws UnsupportedEncodingException if encoding names an unknown charset
   */
  public static String unescapeUnicode(String s, String encoding) throws
      UnsupportedEncodingException {
    if (s == null) {
      return s;
    }

    String decoded = decodeEscapes(s);

    if (encoding == null || "".equals(encoding)) {
      // No target charset requested: do not push the text through the
      // platform charset, which would silently destroy unmappable characters.
      return decoded;
    }
    // Encode and decode with the SAME charset; decoding with the platform
    // default would garble the text whenever the two charsets differ.
    return new String(decoded.getBytes(encoding), encoding);
  }

  /**
   * Replaces every well-formed escape in s by the character it denotes.
   *
   * @param s the text to scan; must not be null
   * @return s itself when it contains no marker, otherwise the decoded copy
   */
  private static String decodeEscapes(String s) {
    int esc = s.indexOf(ESCAPE_PREFIX);
    if (esc < 0) {
      return s;
    }

    int len = s.length();
    int pos = 0; // first character of s not yet copied to out
    StringBuffer out = new StringBuffer(len);

    while (esc >= 0) {
      out.append(s.substring(pos, esc));

      if (esc > 0 && s.charAt(esc - 1) == '\\') {
        // Doubled backslash: the first one was just copied as plain text,
        // so only the 'u' is still missing from the output.
        out.append('u');
        pos = esc + ESCAPE_PREFIX.length();
      }
      else if (esc + ESCAPE_PREFIX.length() + HEX_DIGITS > len) {
        // Truncated escape at the end of the input: keep the tail verbatim.
        pos = esc;
        break;
      }
      else {
        int code = parseHex4(s, esc + ESCAPE_PREFIX.length());
        if (code < 0) {
          // Not an escape after all: keep the marker and rescan what follows.
          out.append(ESCAPE_PREFIX);
          pos = esc + ESCAPE_PREFIX.length();
        }
        else {
          out.append( (char) code);
          pos = esc + ESCAPE_PREFIX.length() + HEX_DIGITS;
        }
      }
      esc = s.indexOf(ESCAPE_PREFIX, pos);
    }

    out.append(s.substring(pos));
    return out.toString();
  }

  /**
   * Parses exactly four ASCII hex digits starting at offset.
   *
   * @param s the text to read from
   * @param offset index of the first digit; offset + 4 must not exceed s.length()
   * @return the value in the range 0..0xFFFF, or -1 if any character is not a hex digit
   */
  private static int parseHex4(String s, int offset) {
    int value = 0;
    for (int i = 0; i < HEX_DIGITS; i++) {
      int digit = hexValue(s.charAt(offset + i));
      if (digit < 0) {
        return -1;
      }
      value = (value << 4) | digit;
    }
    return value;
  }

  /**
   * Returns the numeric value of an ASCII hex digit.
   *
   * @param c the character to test
   * @return 0..15, or -1 if c is not one of 0-9, a-f, A-F
   */
  private static int hexValue(char c) {
    if (c >= '0' && c <= '9') {
      return c - '0';
    }
    if (c >= 'a' && c <= 'f') {
      return c - 'a' + 10;
    }
    if (c >= 'A' && c <= 'F') {
      return c - 'A' + 10;
    }
    return -1;
  }

  /**
   * Replaces every character above U+00FF by a backslash, 'u' and exactly
   * four lowercase hex digits; all other characters (ASCII and the rest of
   * ISO-8859-1) are copied unchanged.
   *
   * <p>Markers already present in the input are not protected, so text that
   * literally contains a backslash followed by 'u' and four hex digits will
   * be decoded by {@link #unescapeUnicode(String, String)}.</p>
   *
   * @param s the string to escape; may be null
   * @return the escaped string, or null if s is null
   */
  public static String escapeUnicode(String s) {
    if (s == null) {
      return s;
    }
    char[] chars = s.toCharArray();
    StringBuffer sb = new StringBuffer(chars.length);
    for (int i = 0; i < chars.length; i++) {
      char c = chars[i];
      if (c > 0xff) {
        String hex = Integer.toHexString(c);
        sb.append(ESCAPE_PREFIX);
        // Left-pad with zeros: the decoder requires exactly four digits.
        for (int pad = hex.length(); pad < HEX_DIGITS; pad++) {
          sb.append('0');
        }
        sb.append(hex);
      }
      else {
        sb.append(c);
      }
    }
    return sb.toString();
  }
}

<p class="indent">

西方哲學中有類似於東方哲學的“道”嗎？
2025-01-09
YUI可真是個不錯的東東
2019-05-31
UI
數字廣東：廣東省「一號工程」的第一個500天
2019-04-04
一個類似於Gridster的柵格佈局系統Vue元件
2023-10-30
Vue元件
使用VuePress搭建一個類似element的文件
2018-06-01
Vue
一個TextView設定多種格式(類似於“評論”的樣式)
2018-12-03
TextView
關於 Gradle 依賴庫的幾個東西
2019-03-16
Gradle
關於東莞開票-東莞開票
2020-11-25
js中的arguments是一個好東西
2018-09-23
JS
一個有意思的數學東西
2020-12-11
用Vue仿了一個類似抖音的App
2024-04-03
VueAPP
【like-react】手寫一個類似 react 的框架
2019-06-15
React框架
挖一挖@Bean這個東西
2019-01-25
Bean
【JavaScript框架封裝】實現一個類似於JQuery的動畫框架的封裝
2018-07-19
JavaScript框架封裝jQuery動畫
“警車”的“警燈”，類似於GIF
2019-10-15
【JavaScript框架封裝】實現一個類似於JQuery的CSS樣式框架的封裝
2018-07-19
JavaScript框架封裝jQueryCSS
系統設計：如何設計一個類似於Tinder的基於位置的社交搜尋應用
2022-02-02
採用 SwiftNIO 實現一個類似 Express 的 Web 框架
2018-08-09
SwiftExpressWeb框架
夢是個神奇的東西
2019-01-05
寫了一個 SRE 除錯工具，類似一個小木馬
2024-04-18
除錯
關於東莞哪裡可以開具票-東莞
2021-03-02
yearrecord——一個類似痕跡牆的React資料展示元件
2024-07-17
React元件
opacity這個小東西
2018-12-11
env 是個好東西
2023-04-26
[練手]CantoneseCool 一個能說廣東話的小程式。
2020-11-22
京東一號店
2024-03-14
使用css的attr()寫一個類似a標籤title的提示框
2024-12-04
CSS
關於東莞開會議費發票-開票服務大廳一東莞本地寶
2020-11-08
基於原生fetch封裝一個帶有攔截器功能的fetch，類似axios的攔截器
2019-02-24
封裝iOS
解釋一下一門語言該有的東東（Javascript）
2019-08-03
JavaScript
做一個類似賽博賞小程式大概多少錢
2023-02-20
封裝一個exec方式：直接把東西都拿到
2019-09-25
封裝
京東的18歲，劉強東的29年
2021-06-18
近期做的一些東西
2024-06-15
關於東莞哪裡有開住宿費發票-東莞本地寶
2020-11-07
關於東莞哪裡可以開住宿費發票-東莞本地寶
2020-06-06
關於東莞哪裡可以開手撕發票-東莞本地寶
2020-06-06
關於東莞哪裡可以開滴滴費 F 票-東莞
2021-03-06
TCP 的 Keepalive 和 HTTP 的 Keep-Alive 是一個東西嗎？
2021-07-08
TCPHTTPKeep-Alive

一個類似於 Native2Ascii 的東東

相關文章