백엔드기술/개발언어
한자를 한글로 변환하기
RevFactory
2010. 7. 22. 15:46
시험용 데이터에 한자가 너무 많아서 한글로의 변환이 필요했다.
한자를 한글로 변환하는 코드 출처는 아래와 같다.
http://devhome.tistory.com/20
[메인코드]
public static void main(String[] args) throws UnsupportedEncodingException, JAXBException, FileNotFoundException {
String keyword = "黃旼浩";
Hanja hanja = new Hanja();
System.out.println(hanja.toHangle(keyword));
}
String keyword = "黃旼浩";
Hanja hanja = new Hanja();
System.out.println(hanja.toHangle(keyword));
}
[결과]
황민호
급히 만든 간단한 한자변환 클래스
[Hanja.java]
package hanjatohangle;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.StringTokenizer;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
/**
* @author Hwang
*/
public class Hanja {
private static char[] HANJA_TO_HANGLE_MAP = new char[65565];
* @author Hwang
*/
public class Hanja {
private static char[] HANJA_TO_HANGLE_MAP = new char[65565];
public Hanja() throws FileNotFoundException, JAXBException {
loadXMLData();
}
loadXMLData();
}
private void loadXMLData() throws FileNotFoundException, JAXBException {
JAXBContext context = JAXBContext.newInstance(HanjaToHangle.class);
JAXBContext context = JAXBContext.newInstance(HanjaToHangle.class);
//XML을 자바 Object로 변환하기 위한 Unmarshaller 작성
Unmarshaller unmarshaller = context.createUnmarshaller();
//unmarshaller.setSchema(schema);
Unmarshaller unmarshaller = context.createUnmarshaller();
//unmarshaller.setSchema(schema);
//XML에서 자바 Object로 변환
HanjaToHangle root = (HanjaToHangle) unmarshaller.unmarshal(new FileInputStream("hanjatohangle.xml"));
HanjaToHangle root = (HanjaToHangle) unmarshaller.unmarshal(new FileInputStream("hanjatohangle.xml"));
StringTokenizer stok = new StringTokenizer(root.unicodeMap.get(0).code, ",");
int i = 0;
while(stok.hasMoreTokens()) {
int value = Integer.decode(stok.nextToken().trim());
HANJA_TO_HANGLE_MAP[i++] = (char)value;
}
}
int i = 0;
while(stok.hasMoreTokens()) {
int value = Integer.decode(stok.nextToken().trim());
HANJA_TO_HANGLE_MAP[i++] = (char)value;
}
}
public static String toHangle(String hanja) throws UnsupportedEncodingException {
char unicode = 0x0000;
byte[] hanjaByte = hanja.getBytes("UTF-8");
for(int i = 0 ; i < hanjaByte.length; ) {
if( (hanjaByte[i]&0xFF) < 0x80 ) {
i++;
continue;
} else if( (hanjaByte[i]&0xFF) < 0xE0 ) {
i += 2;
continue;
} else if( (hanjaByte[i]&0xFF) < 0xF0 ) {
unicode = (char)(hanjaByte[i] & 0x0f);
i++;
unicode = (char)(unicode << 6);
unicode = (char)(unicode | (hanjaByte[i] & 0x3f));
i++;
unicode = (char)(unicode << 6);
unicode = (char)(unicode | (hanjaByte[i] & 0x3f));
i++;
}
char unicode = 0x0000;
byte[] hanjaByte = hanja.getBytes("UTF-8");
for(int i = 0 ; i < hanjaByte.length; ) {
if( (hanjaByte[i]&0xFF) < 0x80 ) {
i++;
continue;
} else if( (hanjaByte[i]&0xFF) < 0xE0 ) {
i += 2;
continue;
} else if( (hanjaByte[i]&0xFF) < 0xF0 ) {
unicode = (char)(hanjaByte[i] & 0x0f);
i++;
unicode = (char)(unicode << 6);
unicode = (char)(unicode | (hanjaByte[i] & 0x3f));
i++;
unicode = (char)(unicode << 6);
unicode = (char)(unicode | (hanjaByte[i] & 0x3f));
i++;
}
if(HANJA_TO_HANGLE_MAP[unicode] != unicode) {
unicode = HANJA_TO_HANGLE_MAP[unicode];
hanjaByte[i-1] = (byte)((unicode & 0x3f) | 0x80);
hanjaByte[i-2] = (byte)(((unicode << 2) & 0x3f00 | 0x8000) >> 8);
hanjaByte[i-3] = (byte)(((unicode << 4) & 0x3f0000 | 0xe00000) >> 16);
continue;
}
}
return (new String(hanjaByte, "UTF-8"));
}
}
unicode = HANJA_TO_HANGLE_MAP[unicode];
hanjaByte[i-1] = (byte)((unicode & 0x3f) | 0x80);
hanjaByte[i-2] = (byte)(((unicode << 2) & 0x3f00 | 0x8000) >> 8);
hanjaByte[i-3] = (byte)(((unicode << 4) & 0x3f0000 | 0xe00000) >> 16);
continue;
}
}
return (new String(hanjaByte, "UTF-8"));
}
}
/**
 * JAXB binding class for the root element of the conversion table XML.
 *
 * <p>No element name is given on the annotations, so JAXB derives the XML names from the class
 * and field names; the XML file must use matching capitalization.
 */
@XmlRootElement
class HanjaToHangle {

    /** No-argument constructor, needed so JAXB can instantiate the class while unmarshalling. */
    HanjaToHangle() {
    }

    /** The {@code unicodeMap} child elements, in document order. */
    @XmlElement
    List<UnicodeMap> unicodeMap;
}
class UnicodeMap {
UnicodeMap() {
}
@XmlElement
String code;
}
UnicodeMap() {
}
@XmlElement
String code;
}
[UNICODE가 들어있는 XML파일]
- 배열의 인덱스와 값을 매칭시켜 사용한다.
- 프로젝트 폴더로 복사
* 평소 XML 파일을 읽고 쓰는 데 파서를 이용하다가 JAXB를 사용해 보았는데... 와우, 장난 아니다. 진작에 써 볼걸...
배우는걸 게을리하면 몸이 고생한다는 말을 체득했다. ㅎㅎ
XML Element 이름의 대소문자 주의!
* int value = Integer.decode(stok.nextToken().trim());
이 부분은 String에 "0x0000"처럼 16진수 표기로 들어 있는 문자열을 정수(Integer)로 디코딩한다.
추가. JAXB 튜토리얼
http://blog.bdoughan.com/2010/10/how-does-jaxb-compare-to-xstream.html
추가2. 프로젝트 전체 파일을 업로드 합니다.