`

抓取时网址不完整补全处理

阅读更多
/**
	 * Turns a possibly incomplete link scraped from a page into a full URL.
	 * <pre>
	 * example:
	 * 	sourceUrl = "http://www.wed114.cn/jiehun/shishanghunli/hunlicehua/";
	 * 	toConvertUrl = "/jiehun/shishanghunli/hunlicehua/201106/1496.html";
	 * 	latestUrl = "http://www.wed114.cn/jiehun/shishanghunli/hunlicehua/201106/1496.html";
	 * 	level is optional:
	 * 		level = 1; (default)	prefixUrl = "http://www.wed114.cn";
	 * 		level = 2; 			prefixUrl = "http://www.wed114.cn/jiehun";
	 * 	Leading '.' characters of the link ("./", "../") are simply stripped,
	 * 	so the link is always appended directly to the chosen prefix.
	 * 	A link that already starts with "http://" or "https://" is returned as is.
	 * </pre>
	 * The prefix is everything in {@code sourceUrl} before its
	 * {@code (level + 2)}-th '/'; the extra 2 accounts for the "//" that follows
	 * the scheme. When {@code sourceUrl} does not contain that many slashes, or
	 * the level is too small for the helper to accept, the stack trace and an
	 * error message are printed and a single space is returned.
	 *
	 * @param sourceUrl 	URL of the page the link was found on
	 * @param toConvertUrl	scraped link to complete
	 * @param level			how many path levels of sourceUrl to keep (optional, default 1)
	 * @return	the full URL, or {@code " "} when the prefix cannot be determined
	 * @author  chitianxiang $9th April, 2012 - 10:45 a.m
	 */
	private static String getFullurl(String sourceUrl, String toConvertUrl, int... level) {
		// Only a link that STARTS with a web scheme is absolute. A substring test
		// would mistake "/go?u=http://x" for an absolute link, and a plain
		// "http://" test would glue "https://..." links onto the prefix.
		// Surrounding whitespace is ignored for detection only; the link itself
		// is returned untouched.
		String candidate = toConvertUrl.trim();
		if (candidate.regionMatches(true, 0, "http://", 0, 7)
				|| candidate.regionMatches(true, 0, "https://", 0, 8)) {
			return toConvertUrl;
		}

		// An explicitly passed null array is treated like "no level given".
		int srcLevel = 1;
		if (level != null && level.length > 0) {
			srcLevel = level[0];
		}

		String prefixUrl = sourceUrl;		// falls back to the whole source URL
		int endIndex;
		try {
			// srcLevel + 2: the 2 skips the "//" after the scheme
			endIndex = EmpInfoGrabUtil.indexOf(sourceUrl, '/', srcLevel + 2);
		} catch (RuntimeException e) {
			// The helper only throws unchecked exceptions (bad level, too few slashes).
			e.printStackTrace();
			System.out.println("完整地址转换出现异常!!!");
			return " ";
		}
		if (endIndex != -1) {
			prefixUrl = sourceUrl.substring(0, endIndex);
		}

		// Drop every leading '.', so "./a" and "../a" both become "/a".
		String path = toConvertUrl;
		while (path.startsWith(".")) {
			path = path.substring(1);
		}

		// Make sure exactly the link's own leading slash joins prefix and path.
		if (!path.startsWith("/")) {
			path = "/" + path;
		}

		return prefixUrl + path;
	}
	
	/**
	 * Finds the position of the n-th occurrence of a character in a string.
	 *
	 * @param str 			string to search
	 * @param c				character to look for
	 * @param charNum		which occurrence is wanted (1 = first)
	 * @return	index of that occurrence, or -1 when {@code str} is null or empty
	 * @throws IllegalArgumentException if {@code charNum} is not positive
	 *         (only checked for a non-empty string)
	 * @throws IndexOutOfBoundsException if {@code str} contains fewer than
	 *         {@code charNum} occurrences of {@code c}
	 * @author chitianxiang $9th April, 2012 - 10:25 a.m
	 */
	private static int indexOf(String str, char c, int charNum) {
		// Nothing to search: answered before the occurrence number is validated.
		if (str == null || str.length() == 0) {
			return -1;
		}

		if (charNum < 1) {
			throw new IllegalArgumentException("第几个目标必须为正整数");
		}

		// Count down the occurrences still needed; hitting zero means found.
		int remaining = charNum;
		int pos = 0;
		final int length = str.length();
		while (pos < length) {
			if (str.codePointAt(pos) == c) {
				remaining--;
				if (remaining == 0) {
					return pos;
				}
			}
			pos++;
		}

		// Reached the end without seeing enough occurrences.
		throw new IndexOutOfBoundsException("设置第几个目标过长,没有那么多目标");
	}


 运用:

String sourceUrl = "http://www.wed114.cn/jiehun/shishanghunli/hunlicehua/";
String toConvertUrl = "/jiehun/shishanghunli/hunlicehua/201106/1496.html";   
getFullurl(sourceUrl, toConvertUrl);
http://www.wed114.cn/jiehun/shishanghunli/hunlicehua/201106/1496.html


String sourceUrl = "http://www.wed114.cn/jiehun/shishanghunli/hunlicehua/";
String toConvertUrl = "/shishanghunli/hunlicehua/201106/1496.html";   
getFullurl(sourceUrl, toConvertUrl, 2);
http://www.wed114.cn/jiehun/shishanghunli/hunlicehua/201106/1496.html
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics