`

Jsoup抓取

阅读更多

1.下载Jsoup核心库

 

地址: http://jsoup.org/download

 

 

2.使用示例(以下代码均为同一个类中的成员片段,使用前需导入 org.jsoup 以及 java.util 中的相关类)

  /**
	 * Fetches the page at the given URL and parses it into a jsoup Document.
	 *
	 * @param sourceUrl	address of the page to fetch
	 * @return			the Document returned by the connection's GET request
	 * @throws IOException	if the request fails
	 * @author chitianxiang $Feb 6th, 2012
	 */
	private static Document getDocument(String sourceUrl) throws IOException {
		Connection conn = Jsoup.connect(sourceUrl);
		/*
		 * HTTP 500 errors:
		 * a site that recognises a crawler may simply refuse access,
		 * so add a browser-like User-Agent header as a light disguise.
		 */
		conn.header("User-Agent", "Mozilla/5.0 (Macintosh; "
				+ "U; Intel Mac OS X 10.4; en-US; "
				+ "rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2");

		// Use the timeout constant declared alongside the other fields of this class.
		return conn.timeout(JSOUP_TIMEOUT_MAX_VAL).get();
	}
 
	private static final int JSOUP_TIMEOUT_MAX_VAL = 10000;	// longest time to wait for a response when grabbing with Jsoup (presumably milliseconds - confirm against the Jsoup timeout API)
	private static Map<String, String[]> data; // data sources: type name -> {site name, URL}, filled by init()

	private static final String SINA = "新浪"; // site name used to select the Sina grabbing routine
	private static final String IFENG = "凤凰网"; // site name used to select the Ifeng grabbing routine

  /**
	 * Loads the built-in data sources into {@code data}.
	 * Does nothing when {@code data} already holds at least one entry.
	 * Each entry maps a type name to a two-element array: {site name, URL}.
	 *
	 * @author chitianxiang 2011/11/3
	 */
	public static void init() throws Exception{
		// Check first, so the "loading" message is only printed when a load really happens.
		if (null != data && !data.isEmpty()) {
			return;
		}
		System.out.println("开始加载数据源...");

		// Fill a local map and publish it in one assignment, so the shared
		// field never refers to a half-populated map.
		Map<String, String[]> sources = new HashMap<String, String[]>();
		sources.put("焦点新闻1", new String[]{SINA, "http://rss.sina.com.cn/news/china/politics15.xml"});
		sources.put("焦点新闻2", new String[]{IFENG, "http://news.ifeng.com/mainland/"});
		data = sources;
	}

  /**
	 * Collects the data-source entries whose type name contains the given text.
	 * Calls {@code init()} first when the data sources have not been created yet.
	 *
	 * @param intactTypeName	text to look for inside each registered type name
	 * @return					the matching non-null source arrays; empty when nothing matches
	 * @author chitianxiang 2011/11/3
	 */
	public static List<String[]> getDataLst(String intactTypeName) 
			throws Exception{

		List<String[]> matches = new ArrayList<String[]>();

		if (data == null) {
			init();
		}

		for (Map.Entry<String, String[]> entry : data.entrySet()) {
			// Key test comes first, exactly as in the lookup-by-key version.
			if (!entry.getKey().contains(intactTypeName)) {
				continue;
			}
			String[] source = entry.getValue();
			if (source != null) {
				matches.add(source);
			}
		}

		return matches;
	}

       /**
	 * Grabs list data by dispatching to the routine that matches the site name.
	 * A site name that is neither {@code SINA} nor {@code IFENG} is ignored.
	 *
	 * @param sourceName	site name
	 * @param sourceUrl		address to grab from
	 * @param map			collection handed through to the site-specific routine
	 * @author chitianxiang 2011/11/3
	 */
	public static void doGrab2Lst(String sourceName, String sourceUrl, Map map) throws Exception{
		if (SINA.equals(sourceName)) {
			grabLstBySina(sourceName, sourceUrl, map);
		} else if (IFENG.equals(sourceName)) {
			// Arguments are separated by commas; a '.' after sourceName does not compile.
			grabLstByIfeng(sourceName, sourceUrl, map);
		}
	}

   /**
	 * Grabs the reading-list data from a Sina feed.
	 * Fetches the document at {@code sourceUrl}, walks every {@code item}
	 * element and reads its title, description and guid. Items with an empty
	 * description are skipped. Any failure is reported on the console and
	 * not propagated to the caller.
	 *
	 * @param sourceName	site name, used in the failure message
	 * @param sourceUrl		address of the feed
	 * @param map			collection passed in by the caller
	 * @author chitianxiang 2011/11/5
	 */
	private static void grabLstBySina(String sourceName, String sourceUrl, Map map)
			throws Exception{
		
		try {
			Document doc = getDocument(sourceUrl);
			Elements elements = doc.select("item");
			
			for (Element element : elements) {
				String title = element.select("title").text(); // title
				String content = element.select("description").text(); // displayed content
				// Skip items with nothing to show (checks the local declared above).
				if ("".equals(content)) {
					continue;
				}
				String outUrl = element.select("guid").text(); // external link URL
				
				// NOTE(review): title, content and outUrl are not stored anywhere in
				// the code shown here, and map is never written - the rest of the
				// loop body appears to be missing; confirm against the full source.
			}
		} catch (Exception e) {
			System.out.println("抓取" + sourceName + "失败!!!");
			// Keep the cause visible instead of discarding it.
			System.err.println(sourceUrl + " -> " + e);
		}
	}
 

 

 

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics