很楠不爱3

这个屌丝很懒，什么也没留下！

热门标签

html获取页面标签,获取html页面全部标签或者标签内容

作者：很楠不爱3 | 2024-02-18 21:16:00

踩

html获取外层标签

首先是两个正则表达式：

1.<[^>]+>：这个正则表达式可以匹配所有html标签，可以100%匹配（注意页面编码方式和读取的编码方式）。

2.>[^<]+<：这个正则表达式可以匹配html标签之间的文本内容（匹配结果首尾会带上“>”和“<”，使用时需要去掉）。

下面上程序：

import java.io.BufferedReader;

import java.io.InputStreamReader;

import java.net.URL;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

public class URLTest {

/**

* @param args

* @throws URISyntaxException

public static void main(String[] args) throws Exception {

URL url = new URL("http://www.ascii-code.com/");

InputStreamReader reader = new InputStreamReader(url.openStream());

BufferedReader br = new BufferedReader(reader);

String s = null;

while((s=br.readLine())!=null){

s = GetLabel(s);

if(s!=null){

System.out.println(s);

}

br.close();

reader.close();

}

public static String GetContent(String html) {

//String html = "

1.hehe
2.hi
3.hei

String ss = ">[^

String temp = null;

Pattern pa = Pattern.compile(ss);

Matcher ma = null;

ma = pa.matcher(html);

String result = null;

while(ma.find()){

temp = ma.group();

if(temp!=null){

if(temp.startsWith(">")){

temp = temp.substring(1);

}

if(temp.endsWith("

temp = temp.substring(0, temp.length()-1);

}

if(!temp.equalsIgnoreCase("")){

if(result==null){

result = temp;

}

else{

result+="____"+temp;

}

return result;

}

public static String GetLabel(String html) {