웹 수집 소스 (1차)
기본 흐름은 아래와 같다.
1. HTTPS 로그인
2. 세션 유지
3. 페이지별 수집 URL의 Table 셀에서 데이터 수집
적용사항
*HTTPS 페이지 접근 시 인증서 검증 실패로 발생하는 SSL Handshake 예외를 피하기 위해, 모든 인증서와 호스트명을 신뢰하도록 하는 우회 방법 적용 (인증서 검증이 꺼지므로 신뢰할 수 있는 대상에만 사용할 것)
*쿠키를 이용한 세션유지
*HTML 파서를 이용한 DOM 접근 방식의 데이터 파싱
WEBCrawler.java
- 로그인 URL과 수집 URL을 지정
- 페이지 번호로 구분된 수집 URL에 접근할 때, 지정한 최대 페이지 번호(maxPage)까지 반복 수집
- Output은 구분자(|) 로 구분된 TEXT파일 ( 파일당 최대 10,000라인 )
package webcralwer;
import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import java.util.List; import java.util.Map; import javax.net.ssl.HttpsURLConnection;
/**
 * Logs in over HTTPS, keeps the session through cookies and crawls a range of
 * paged list URLs, handing every page to {@link WEBParser}.
 *
 * <p>SECURITY: {@link #trustAllHttpsCertificates()} disables certificate and
 * hostname validation for every HTTPS connection in the JVM. That is the
 * workaround this crawler was written around; do not reuse it against hosts
 * you do not control.
 *
 * @since 2012.07.30
 * @author rev
 */
public class WEBCrawler {
    private String loginUrl = "https://[LOGIN URL]/login?id=[ID]&password=[PASSWORD]";
    private String crawlUrl = "http://[URL]/list?page=";
    private static String parsingType = "type1";
    private static int maxPage = 9;
    /** Session cookies as a ready-to-send Cookie header value; empty until login succeeds. */
    private String cookies = "";
    private static OutputWriter writer = new OutputWriter();

    /**
     * Logs in once, then crawls pages 0..maxPage (inclusive) with the configured
     * parsing type.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        WEBCrawler crawler = new WEBCrawler();
        try {
            crawler.login();
            for (int page = 0; page <= maxPage; page++) {
                crawler.crawl(page, parsingType);
            }
        } finally {
            // Always flush and close the output, even if a page blows up.
            writer.close();
        }
    }

    /**
     * Sends the login request and stores the cookies of the response.
     * Failures are reported on standard output and leave the cookies empty.
     */
    public void login() {
        try {
            URL url = new URL(loginUrl);
            trustAllHttpsCertificates();
            HttpsURLConnection conn = (HttpsURLConnection) url.openConnection();
            // A login response must never come from a cache.
            conn.setUseCaches(false);
            // Do not follow the redirect: the cookies are read from this response.
            conn.setInstanceFollowRedirects(false);
            conn.setRequestMethod("POST");
            conn.setRequestProperty("Content-Type", "text/xml;charset=EUC-KR");
            // getHeaderFields() hides I/O errors behind an empty map; asking for the
            // status first makes a failed connection throw instead.
            int status = conn.getResponseCode();
            cookies = getCookie(conn);
            if (cookies.isEmpty()) {
                System.out.println("Warning: login response (HTTP " + status + ") set no cookie");
            }
            System.out.println("Login Complete...");
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    /**
     * Fetches one list page and passes the connection to the parser.
     *
     * @param pageNo page number appended to the crawl URL
     * @param type   parsing type forwarded to the parser; when {@code null} the
     *               configured default parsing type is used
     * @return always an empty string (kept for compatibility with callers)
     */
    public String crawl(int pageNo, String type) {
        WEBParser parser = new WEBParser(writer);
        String effectiveType = (type != null) ? type : parsingType;
        try {
            URL url = new URL(crawlUrl + pageNo);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            // NOTE(review): no request body is ever written, so this flag looks
            // unnecessary - confirm and remove.
            conn.setDoOutput(true);
            if (cookies != null && !cookies.isEmpty()) {
                conn.setRequestProperty("Cookie", cookies);
            }
            parser.parse(conn, effectiveType);
            System.out.println("[" + pageNo + "] Crawl Complete...");
        } catch (Exception e) {
            System.out.println(e);
        }
        return "";
    }

    /**
     * Installs a JVM-wide socket factory and hostname verifier that accept every
     * certificate and every host name.
     *
     * @throws java.security.GeneralSecurityException if the TLS context cannot be created
     */
    private void trustAllHttpsCertificates() throws java.security.GeneralSecurityException {
        javax.net.ssl.TrustManager[] trustAllCerts =
                new javax.net.ssl.TrustManager[] { new SSLTrustManager() };
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
        HttpsURLConnection.setDefaultHostnameVerifier(new SSLHostnameVerifier());
    }

    /**
     * Builds a Cookie request header value from the Set-Cookie headers of a response.
     * Only the leading {@code name=value} pair of each header is kept; attributes
     * such as Path or HttpOnly are dropped.
     *
     * @param conn connection whose response headers are read
     * @return the pairs joined with "; ", or an empty string when there are none
     */
    private String getCookie(URLConnection conn) {
        Map<String, List<String>> headers = conn.getHeaderFields();
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, List<String>> header : headers.entrySet()) {
            // Header names are case-insensitive; the status line is stored under a null key.
            if (!"Set-Cookie".equalsIgnoreCase(header.getKey())) {
                continue;
            }
            for (String setCookie : header.getValue()) {
                if (setCookie == null) {
                    continue;
                }
                int attrStart = setCookie.indexOf(';');
                String pair = (attrStart < 0 ? setCookie : setCookie.substring(0, attrStart)).trim();
                if (pair.isEmpty()) {
                    continue;
                }
                if (sb.length() > 0) {
                    sb.append("; ");
                }
                sb.append(pair);
            }
        }
        return sb.toString();
    }
}
|
WEBParser.java
package webcralwer;
import java.io.IOException; import java.net.HttpURLConnection; import java.util.Iterator; import java.util.List; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.HTMLElementName; import net.htmlparser.jericho.Source;
/** * * @author rev */ public class WEBParser { private OutputWriter writer = null; public WEBParser(OutputWriter writer) { this.writer = writer; } public void parse(HttpURLConnection conn, String type) { switch(type) { case "type1": htmlParse(conn, 2); break; case "type2": htmlParse(conn, 4); break; case "type3": htmlParse(conn, 4); break; default: break; } } private void htmlParse(HttpURLConnection conn, int orderDiv) { Source source = null; try { source = new Source(conn); } catch (IOException e) { } source.fullSequentialParse(); Element parentDiv = null; for(Element div : source.getAllElements(HTMLElementName.DIV)) { if("salary_tbl".equals(div.getAttributeValue("class"))) { parentDiv = div; break; } } if(parentDiv == null) return ; Element div = parentDiv.getChildElements().get(orderDiv); Element table = div.getAllElements(HTMLElementName.TABLE).get(0); Element tbody = table.getAllElements(HTMLElementName.TBODY).get(0); List trList = tbody.getAllElements(HTMLElementName.TR); Iterator iter = trList.iterator(); iter.next(); iter.next(); while(iter.hasNext()){ Element tr = (Element) iter.next(); List dataList = tr.getAllElements(HTMLElementName.TD); Iterator dataIter = dataList.iterator(); while(dataIter.hasNext()){ Element data = (Element) dataIter.next(); String value = data.getContent().getTextExtractor().toString(); writer.append(value).append("|"); } writer.newLine(); } } } |
HTML Parser Library
jericho-html-3.1.jar
SSLTrustManager.java
package webcralwer;
/**
 * Trust manager that accepts every certificate chain without any validation.
 *
 * <p>SECURITY: this removes all protection against man-in-the-middle attacks.
 * It exists only as a workaround for handshake failures on the crawled site.
 *
 * @author rev
 */
public class SSLTrustManager implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {

    /** Shared empty result; the X509TrustManager contract requires a non-null array. */
    private static final java.security.cert.X509Certificate[] NO_ISSUERS =
            new java.security.cert.X509Certificate[0];

    /**
     * @return an empty array (never {@code null})
     */
    @Override
    public java.security.cert.X509Certificate[] getAcceptedIssuers() {
        return NO_ISSUERS;
    }

    /**
     * Legacy helper that is not part of {@code X509TrustManager}.
     *
     * @param certs ignored
     * @return always {@code true}
     */
    public boolean isServerTrusted(java.security.cert.X509Certificate[] certs) {
        return true;
    }

    /**
     * Legacy helper that is not part of {@code X509TrustManager}.
     *
     * @param certs ignored
     * @return always {@code true}
     */
    public boolean isClientTrusted(java.security.cert.X509Certificate[] certs) {
        return true;
    }

    /**
     * Accepts any server certificate chain; never throws.
     *
     * @param certs    ignored
     * @param authType ignored
     */
    @Override
    public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType)
            throws java.security.cert.CertificateException {
        // Intentionally empty: returning normally means "trusted".
    }

    /**
     * Accepts any client certificate chain; never throws.
     *
     * @param certs    ignored
     * @param authType ignored
     */
    @Override
    public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType)
            throws java.security.cert.CertificateException {
        // Intentionally empty: returning normally means "trusted".
    }
}
SSLHostnameVerifier.java
package webcralwer;
import javax.net.ssl.HostnameVerifier; import javax.net.ssl.SSLSession;
/**
 * Hostname verifier that accepts every host. It prints the requested host next
 * to the peer host of the session and then reports the connection as valid.
 *
 * @author rev
 */
public class SSLHostnameVerifier implements HostnameVerifier {

    /**
     * @param hostname host name taken from the requested URL
     * @param session  session whose peer host is printed for comparison
     * @return always {@code true}
     */
    @Override
    public boolean verify(String hostname, SSLSession session) {
        final String peerHost = session.getPeerHost();
        final StringBuilder message = new StringBuilder("Warning: URL Host: ");
        message.append(hostname).append(" vs. ").append(peerHost);
        System.out.println(message);
        return true;
    }
}
OutputWriter.java
package webcralwer;
import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException;
/**
 * Writes delimiter-separated text to numbered files ("output1.txt",
 * "output2.txt", ...), starting a new file once the current one holds
 * {@link #MAXLINE} lines.
 *
 * <p>All methods are best-effort: I/O failures are reported on standard error
 * and never thrown to the caller.
 *
 * @author rev
 */
public class OutputWriter {
    /** Maximum number of lines per output file. */
    private static final int MAXLINE = 10000;

    private final String file = "output";
    private final String extend = ".txt";
    private int fileNo = 1;
    /** 1-based number of the line currently being written in the current file. */
    private int lineCnt = 1;
    /** Current destination; {@code null} when the file could not be opened. */
    private BufferedWriter writer = null;

    /** Opens the first output file. */
    public OutputWriter() {
        try {
            writer = new BufferedWriter(new FileWriter(this.getFileName()));
        } catch (IOException e) {
            report("open", e);
        }
    }

    /**
     * Writes the text and returns this writer so calls can be chained.
     *
     * @param s text to write; {@code null} writes nothing
     * @return this writer
     */
    public OutputWriter append(String s) {
        write(s);
        return this;
    }

    /**
     * Writes the text to the current file.
     *
     * @param s text to write; {@code null} writes nothing
     */
    public void write(String s) {
        if (s == null) {
            return;
        }
        try {
            BufferedWriter out = writerForNextWrite();
            if (out != null) {
                out.write(s);
            }
        } catch (IOException ex) {
            report("write to", ex);
        }
    }

    /** Terminates the current line and advances the line counter. */
    public void newLine() {
        try {
            BufferedWriter out = writerForNextWrite();
            if (out != null) {
                out.newLine();
                lineCnt++;
            }
        } catch (IOException ex) {
            report("write to", ex);
        }
    }

    /** Flushes and closes the current file. */
    public void close() {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException ex) {
            report("close", ex);
        }
    }

    /**
     * @return name of the file currently written to, e.g. "output1.txt"
     */
    public String getFileName() {
        return file + fileNo + extend;
    }

    /**
     * Returns the writer to use for the next write, first switching to the next
     * numbered file when the current one is full. Rotation is deferred until
     * something is actually written, so every file holds at most MAXLINE complete
     * lines and no empty trailing file is created.
     */
    private BufferedWriter writerForNextWrite() throws IOException {
        if (writer != null && lineCnt > MAXLINE) {
            BufferedWriter full = writer;
            writer = null;
            fileNo++;
            lineCnt = 1;
            try {
                full.close();
            } finally {
                writer = new BufferedWriter(new FileWriter(this.getFileName()));
            }
        }
        return writer;
    }

    /** Reports an I/O failure without interrupting the crawl. */
    private void report(String action, IOException e) {
        System.err.println("OutputWriter: failed to " + action + " " + getFileName() + ": " + e);
    }
}