当前位置:   article > 正文

富文本编辑器过滤XSS注入(JSOUP)_js富文本防止注入攻击

js富文本防止注入攻击

众所周知,让用户在富文本编辑器中进行自己的输入绝对不是一个明智的选择,但是有的时候又没有办法,所以只有一条原则来保证系统的安全性,那就是我们让用户输入什么,用户才能输入什么,而不是用户想输入什么,他就能输入什么,这样才能让系统处于我们的掌控,不至于出现各种娄子,比如各种XSS注入什么的。

后来我们发现有一个比较好用的工具——jsoup。它是一个能够对输入的 HTML 进行过滤的库,简单来说就是可以设置白名单和黑名单(基于正则表达式)。白名单就是只允许某个 HTML 标签带有固定的属性,比如我们只允许 <div height="100">,即 div 上只允许有 height 属性,其他属性我们都认为是非法的,这时就可以用 jsoup 设置白名单进行过滤。我们也可以设置黑名单,即我们认为 <div> 标签可以带任何属性,但 style 的内容我们无法控制,于是把它列入黑名单,这同样可以用 jsoup 实现。


下面贴出一个样例:

  1. import java.io.File;
  2. import java.io.FileInputStream;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import java.io.StringWriter;
  6. import java.io.Writer;
  7. import java.util.ArrayList;
  8. import java.util.HashMap;
  9. import java.util.List;
  10. import java.util.Map;
  11. import java.util.regex.Matcher;
  12. import java.util.regex.Pattern;
  13. import net.sf.json.JSONObject;
  14. import net.sf.json.JsonConfig;
  15. import org.apache.commons.io.IOUtils;
  16. import org.jsoup.Jsoup;
  17. import org.jsoup.nodes.Document;
  18. import org.jsoup.nodes.Document.OutputSettings;
  19. import org.jsoup.nodes.Element;
  20. import org.jsoup.safety.Whitelist;
  21. import org.jsoup.select.Elements;
  22. import org.springframework.core.io.ClassPathResource;
  23. import org.springframework.core.io.Resource;
  24. import org.springside.modules.mapper.JsonMapper;
  25. public class HTMLStringFilter {
  /** Regex matching any markup tag: a '<', then everything up to the next '>'. */
  private static final String regxpForHtml = "<([^>]*)>";

  /** The literal "[图片]" ("[picture]"); not referenced by the code visible in this file. */
  private static final String PICTURE = "[图片]";

  // Disabled patterns kept from the original for reference:
  //   regxpForImgTag          = "<\\s*img\\s+([^>]*)\\s*>"   -- locates IMG tags
  //   regxpForImaTagSrcAttrib = "src=\"([^\"]+)\""           -- locates the SRC attribute of an IMG tag

  /** Creates an instance. All visible operations of this class are static, so none is required. */
  public HTMLStringFilter() {
  }
  32. public static String HTMLEncode(String fString){
  33. fString=fString.replaceAll(" <", "<");
  34. fString=fString.replaceAll(">", ">");
  35. fString=fString.replaceAll(new String(new char[]{32}), " ");
  36. fString=fString.replaceAll(new String(new char[]{9}), " ");
  37. fString=fString.replaceAll(new String(new char[]{34}), """);
  38. fString=fString.replaceAll(new String(new char[]{39}), "'");
  39. fString=fString.replaceAll(new String(new char[]{13}), "");
  40. fString=fString.replaceAll(new String(new char[]{10,10}), " </p> <p>");
  41. fString=fString.replaceAll(new String(new char[]{10}), " <br>");
  42. return fString;
  43. }
  44. /**
  45. * xss escape
  46. */
  47. public static String xssEscape(String input) {
  48. return input == null ? null : input.replaceAll("<", "<")
  49. .replaceAll(">", ">")
  50. // .replaceAll("eval\\((.*)\\)", "")
  51. // .replaceAll("[\"'][\\s]*((?i)javascript):(.*)[\"']", "\"\"")
  52. // .replaceAll("((?i)script)", "")
  53. ;
  54. }
  55. /**
  56. * 除指定标签之外的html标签编码
  57. * @param str
  58. * @param tag
  59. * @return
  60. */
  61. public static String xssEscapeExceptTag(String str,String tag) {
  62. String replaceTag="@"+tag+"@";
  63. str=str.replaceAll("<"+tag,replaceTag );
  64. str=xssEscape(str);
  65. str=str.replaceAll(replaceTag, "<"+tag);
  66. return str;
  67. }
  68. public static void main(String[] args){
  69. // System.out.println(new java.util.Date().getTime());
  70. // System.out.println(HTMLStringFilter.filterSafe("< script >ddd</div>"));
  71. // System.out.println(HTMLStringFilter.filterSafe("< div >ddd</div>"));
  72. // System.out.println("======"+HTMLStringFilter.filterSafe("< div oncliCk=''><img src='http://s.jsp'/>ddd</div>"));
  73. //
  74. // String imgHTML="<img src=\"http:\"/>";
  75. // String tag="img";
  76. // System.out.println("filter except:"+filterHtmlExceptTag(imgHTML, tag));
  77. //
  78. // System.out.println(new java.util.Date().getTime());
  79. //
  80. // String source="aaaaa<img alt=\"[可爱]\" src=\"http://img.t.sinajs.cn/t4/appstyle/expression/ext/normal/14/tza_thumb.gif\" height=\"22\" width=\"22\" />bbbb<img alt=\"[给力]\" src=\"http://img.t.sinajs.cn/t4/appstyle/expression/ext/normal/c9/geili_thumb.gif\" height=\"22\" width=\"22\" />ccc";
  81. // String title=replaceTag(source, "img", "alt");
  82. // System.out.println("title=="+title);
  83. //
  84. // String s="<img src=\"http://img7.9158.com/200708/10/09/18/200708103758836.jpg\"/>";
  85. // List<String> srcs=match(source, "img", "src");
  86. // if (CollectionUtils.isNotEmpty(srcs)) {
  87. // for (String att : srcs) {
  88. // System.out.println("attr=="+att);
  89. // }
  90. // }
  91. //
  92. // System.out.println("html标签替换=="+replaceHtmlTagOfText(s, "img", "[图片]"));
  93. //
  94. String htmlStr="<html>bb<img style='display:inline;' alt='[挤眼]' src='http://img.t.sinajs.cn/t4/appstyle/expression/ext/normal/c3/zy_thumb.gif' height='22' width='22' />bb<img style='display:inline;' alt='[挤眼]' src='http://img.t.sinajs.cn/t4/appstyle/expression/ext/normal/c3/zy_thumb.gif' height='22' width='22' />aaaa</html>";
  95. List<String> srcs=getImgHTML(htmlStr);
  96. for (String src : srcs) {
  97. System.out.println("======="+src);
  98. }
  99. // System.out.println("=HTMLEncode=="+);
  100. // List<String> htmls=getImgHTML(htmlStr);
  101. // List<String> srcs=getImgSrc(htmlStr);
  102. //
  103. // System.out.println("--"+htmls.size()+"=="+srcs.size());
  104. //
  105. // for (String s : htmls) {
  106. // System.out.println("----"+s);
  107. // System.out.print(htmlStr.replaceFirst(s, "[图一]"));
  108. // }
  109. // for (String s : srcs) {
  110. // System.out.println("==="+s);
  111. // }
  112. }
  113. /**
  114. * 过滤一下字符串,连同前后< xxx >yyy< / xxx >全部消除。
  115. * 不区分大小写、空格可识别
  116. * <br>"function", "window\\.", "javascript:", "script",
  117. * <br>"js:", "about:", "file:", "document\\.", "vbs:", "frame",
  118. * <br>"cookie", "onclick", "onfinish", "onmouse", "onexit=",
  119. * <br>"onerror", "onclick", "onkey", "onload", "onfocus", "onblur"
  120. * @param htmlStr
  121. * @return
  122. */
  123. public static String filterSafe(String htmlStr){
  124. Pattern p = null; // 正则表达式
  125. Matcher m = null; // 操作的字符串
  126. StringBuffer tmp = null;
  127. String str = "";
  128. boolean isHave = false;
  129. String[] Rstr = { "meta", "script", "object", "embed" };
  130. if (htmlStr == null || !(htmlStr.length() > 0)) {
  131. return "";
  132. }
  133. str = htmlStr.toLowerCase();
  134. for (int i = 0; i < Rstr.length; i++) {
  135. p = Pattern.compile("<" + Rstr[i] + "(.[^>])*>");
  136. m = p.matcher(str);
  137. tmp = new StringBuffer();
  138. if (m.find()) {
  139. m.appendReplacement(tmp, "<" + Rstr[i] + ">");
  140. while (m.find()) {
  141. m.appendReplacement(tmp, "<" + Rstr[i] + ">");
  142. }
  143. isHave = true;
  144. }
  145. m.appendTail(tmp);
  146. str = tmp.toString();
  147. p = Pattern.compile("</" + Rstr[i] + "(.[^>])*>");
  148. m = p.matcher(str);
  149. tmp = new StringBuffer();
  150. if (m.find()) {
  151. m.appendReplacement(tmp, "</" + Rstr[i] + ">");
  152. while (m.find()) {
  153. m.appendReplacement(tmp, "</" + Rstr[i] + ">");
  154. }
  155. isHave = true;
  156. }
  157. m.appendTail(tmp);
  158. str = tmp.toString();
  159. }
  160. // System.out.println(str);
  161. String[] Rstr1 = { "function", "window\\.", "javascript:", "script",
  162. "js:", "about:", "file:", "document\\.", "vbs:", "frame",
  163. "cookie", "onclick", "onfinish", "onmouse", "onexit=",
  164. "onerror", "onclick", "onkey", "onload", "onfocus", "onblur" };
  165. for (int i = 0; i < Rstr1.length; i++) {
  166. p = Pattern.compile("<([^<>])*" + Rstr1[i] + "([^<>])*>([^<>])*</([^<>])*>");
  167. m = p.matcher(str);
  168. tmp = new StringBuffer();
  169. if (m.find()) {
  170. m.appendReplacement(tmp, "");
  171. while (m.find()) {
  172. m.appendReplacement(tmp, "");
  173. }
  174. isHave = true;
  175. }
  176. m.appendTail(tmp);
  177. str = tmp.toString();
  178. }
  179. if (isHave) {
  180. htmlStr = str;
  181. }
  182. htmlStr = htmlStr.replaceAll("%3C", "<");
  183. htmlStr = htmlStr.replaceAll("%3E", ">");
  184. htmlStr = htmlStr.replaceAll("%2F", "");
  185. htmlStr = htmlStr.replaceAll("&#", "<b>&#</b>");
  186. return htmlStr;
  187. }
  188. /**
  189. * 采用jsoup白名单方式过滤非法的html字符。
  190. * 原理:
  191. * 1.首先通过白名单过滤掉非法的html标签,即只允许输出白名单内的标签
  192. * 2.对特殊的属性(主要是style)用正则过滤,只允许安全的属性值存在
  193. * @param htmlStr 原始的html片段(用户通过富文本编辑器提交的html代码)
  194. * @return 过滤后的安全的html片段
  195. */
  196. public static String cleanSafeHtml(String htmlStr) {
  197. Document doc = Jsoup.parseBodyFragment(htmlStr);
  198. OutputSettings outSet = new OutputSettings();
  199. outSet.prettyPrint(false);
  200. outSet.outline(false);
  201. doc.outputSettings(outSet);
  202. Map<String, String> regexMap = initRegexMap();
  203. if (regexMap != null) {
  204. for (Map.Entry<String,String> entiy:regexMap.entrySet()){
  205. String key = entiy.getKey();
  206. Elements els = doc.select(key);
  207. for (Element el:els) {
  208. System.out.println("old el:"+el.toString());
  209. String attribute = key.substring(key.indexOf("[")+1, key.indexOf("]"));
  210. String attributeValue = el.attr(attribute);
  211. Matcher valueMatcher = Pattern.compile(entiy.getValue()).matcher(attributeValue);
  212. if (valueMatcher.find()) {
  213. String safeValue = valueMatcher.group();
  214. System.out.println("safeValue:"+safeValue);
  215. el.attr(attribute, safeValue);
  216. }
  217. System.out.println("new el:"+el.toString());
  218. }
  219. }
  220. }
  221. Whitelist whitelist = initWhiteList();
  222. String safeString = Jsoup.clean(doc.html(), "", whitelist);
  223. System.out.println("safestring:"+safeString);
  224. return safeString;
  225. // Elements els = doc.select("[style]");
  226. // for (Element el:els) {
  227. // System.out.println("old el:"+el.toString());
  228. // String styleattribute = el.attr("style");
  229. // Matcher styleMatcher = Pattern.compile(styleAttributeRegex).matcher(styleattribute);
  230. // if (styleMatcher.find()) {
  231. // String safeStyle = styleMatcher.group();
  232. // System.out.println("safeStyle:"+safeStyle);
  233. // el.attr("style", safeStyle);
  234. // }
  235. // System.out.println("new el:"+el.toString());
  236. // }
  237. // Whitelist whitelist = Whitelist.relaxed();
  238. // whitelist.addAttributes("span", "style");
  239. // String safeString = Jsoup.clean(doc.html(), "", whitelist);
  240. // System.out.println("safestring:"+safeString);
  241. // return safeString;
  242. }
  243. private static Whitelist whitelist = null;
  244. private static Whitelist initWhiteList() {
  245. if (whitelist == null) {
  246. synchronized(new Object()) {
  247. whitelist = new Whitelist();
  248. String jsonString = null;
  249. Resource resource = new ClassPathResource("/data/whitelist.conf");
  250. File file = null;
  251. InputStream input = null;
  252. Writer output = null;
  253. try {
  254. file = resource.getFile();
  255. input = new FileInputStream(file);
  256. output = new StringWriter();
  257. IOUtils.copy(input, output);
  258. jsonString = output.toString();
  259. } catch (IOException e) {
  260. // TODO Auto-generated catch block
  261. e.printStackTrace();
  262. }finally {
  263. if (input != null) {
  264. IOUtils.closeQuietly(input);
  265. }
  266. if (output != null) {
  267. IOUtils.closeQuietly(output);
  268. }
  269. }
  270. JsonConfig config = new JsonConfig();
  271. config.setIgnoreDefaultExcludes(true);//这里不设置,会把class属性过滤掉
  272. JSONObject jsonObject = JSONObject.fromObject(jsonString,config);
  273. JSONObject whitelistjson = jsonObject.getJSONObject("whiteList");
  274. JSONObject protocolsjson = jsonObject.getJSONObject("protocols");
  275. JsonMapper newMapper = new JsonMapper();
  276. Map<String, Map<String, String>> whitelistmap = newMapper.fromJson(whitelistjson.toString(), HashMap.class);
  277. Map<String, List<String>> protocolsmap = newMapper.fromJson(protocolsjson.toString(), HashMap.class);
  278. for (Map.Entry<String, Map<String, String>> entiy:whitelistmap.entrySet()){
  279. String tag = entiy.getKey();
  280. whitelist.addTags(tag);
  281. for (Map.Entry<String,String> entiy2:entiy.getValue().entrySet()){
  282. String attribute = entiy2.getKey();
  283. whitelist.addAttributes(tag, attribute);
  284. System.out.println("value value:"+entiy2.getValue());
  285. }
  286. }
  287. for (Map.Entry<String, List<String>> entiy:protocolsmap.entrySet()){
  288. String tag = entiy.getKey().substring(0, entiy.getKey().indexOf("."));
  289. String key = entiy.getKey().substring(entiy.getKey().indexOf(".")+1, entiy.getKey().length());
  290. for (String entiy2:entiy.getValue()){
  291. whitelist.addProtocols(tag, key, entiy2);
  292. }
  293. }
  294. }
  295. }
  296. return whitelist;
  297. }
  298. private static Map<String, String> regexMap = null;
  299. private static Map<String, String> initRegexMap() {
  300. if (regexMap == null) {
  301. synchronized (new Object()) {
  302. regexMap = new HashMap<String, String>();
  303. String jsonString = null;
  304. Resource resource = new ClassPathResource("/data/whitelist.conf");
  305. File file = null;
  306. InputStream input = null;
  307. Writer output = null;
  308. try {
  309. file = resource.getFile();
  310. input = new FileInputStream(file);
  311. output = new StringWriter();
  312. IOUtils.copy(input, output);
  313. jsonString = output.toString();
  314. } catch (IOException e) {
  315. // TODO Auto-generated catch block
  316. e.printStackTrace();
  317. }finally {
  318. if (input != null) {
  319. IOUtils.closeQuietly(input);
  320. }
  321. if (output != null) {
  322. IOUtils.closeQuietly(output);
  323. }
  324. }
  325. JSONObject jsonObject = JSONObject.fromObject(jsonString);
  326. JSONObject whitelistjson = jsonObject.getJSONObject("whiteList");
  327. JsonMapper newMapper = new JsonMapper();
  328. Map<String, Map<String, String>> whitelistmap = newMapper.fromJson(whitelistjson.toString(), HashMap.class);
  329. for (Map.Entry<String, Map<String, String>> entiy:whitelistmap.entrySet()){
  330. String tag = entiy.getKey();
  331. for (Map.Entry<String,String> entiy2:entiy.getValue().entrySet()){
  332. String attribute = entiy2.getKey();
  333. String attributeValue = entiy2.getValue();
  334. if (attributeValue != null && attributeValue.trim().length() > 0) {
  335. regexMap.put(tag+"["+ attribute +"]", attributeValue);
  336. }
  337. }
  338. }
  339. }
  340. }
  341. return regexMap;
  342. }
  343. public static String filter(String input) {
  344. if (!hasSpecialChars(input)) {
  345. return input;
  346. }
  347. StringBuffer filtered = new StringBuffer(input.length());
  348. char c;
  349. for (int i = 0; i <= input.length() - 1; i++) {
  350. c = input.charAt(i);
  351. switch (c) {
  352. case '<':
  353. filtered.append("<");
  354. break;
  355. case '>':
  356. filtered.append(">");
  357. break;
  358. case '"':
  359. filtered.append("&uot;");
  360. break;
  361. case '&':
  362. filtered.append("&");
  363. break;
  364. default:
  365. filtered.append(c);
  366. }
  367. }
  368. return (filtered.toString());
  369. }
  370. public static boolean hasSpecialChars(String input) {
  371. boolean flag = false;
  372. if ((input != null) && (input.length() > 0)) {
  373. char c;
  374. for (int i = 0; i <= input.length() - 1; i++) {
  375. c = input.charAt(i);
  376. switch (c) {
  377. case '>':
  378. flag = true;
  379. break;
  380. case '<':
  381. flag = true;
  382. break;
  383. case '"':
  384. flag = true;
  385. break;
  386. case '&':
  387. flag = true;
  388. break;
  389. }
  390. }
  391. }
  392. return flag;
  393. }
  394. /**
  395. *
  396. * 基本功能:过滤所有以"<"开头以">"结尾的标签
  397. * <p>
  398. *
  399. * @param str
  400. * @return String
  401. */
  402. public static String filterHtml(String str) {
  403. Pattern pattern = Pattern.compile(regxpForHtml);
  404. Matcher matcher = pattern.matcher(str);
  405. StringBuffer sb = new StringBuffer();
  406. boolean result1 = matcher.find();
  407. while (result1) {
  408. matcher.appendReplacement(sb, "");
  409. result1 = matcher.find();
  410. }
  411. matcher.appendTail(sb);
  412. return sb.toString();
  413. }
  414. /**
  415. * 过滤除指定tag之外的html标签
  416. * @param str
  417. * @param tag
  418. * @return
  419. */
  420. public static String filterHtmlExceptTag(String str,String tag) {
  421. String replaceTag="@"+tag+"@";
  422. str=str.replaceAll("<"+tag,replaceTag );
  423. str=filterHtml(str);
  424. str=str.replaceAll(replaceTag, "<"+tag);
  425. return str;
  426. }
  427. /**
  428. *
  429. * 基本功能:过滤指定标签
  430. * <p>
  431. *
  432. * @param str
  433. * @param tag
  434. * 指定标签
  435. * @return String
  436. */
  437. public static String fiterHtmlTag(String str, String tag) {
  438. String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>";
  439. Pattern pattern = Pattern.compile(regxp);
  440. Matcher matcher = pattern.matcher(str);
  441. StringBuffer sb = new StringBuffer();
  442. boolean result1 = matcher.find();
  443. while (result1) {
  444. matcher.appendReplacement(sb, "");
  445. result1 = matcher.find();
  446. }
  447. matcher.appendTail(sb);
  448. return sb.toString();
  449. }
  450. /**
  451. *
  452. * 基本功能:替换指定的标签
  453. * <p>
  454. *
  455. * @param str
  456. * @param beforeTag
  457. * 要替换的标签
  458. * @param tagAttrib
  459. * 要替换的标签属性值
  460. * @param startTag
  461. * 新标签开始标记
  462. * @param endTag
  463. * 新标签结束标记
  464. * @return String
  465. * @如:替换img标签的src属性值为[img]属性值[/img]
  466. */
  467. public static String replaceHtmlTag(String str, String beforeTag,
  468. String tagAttrib, String startTag, String endTag) {
  469. String regxpForTag = "<\\s*" + beforeTag + "\\s+([^>]*)\\s*>";
  470. String regxpForTagAttrib = tagAttrib + "=\"([^\"]+)\"";
  471. Pattern patternForTag = Pattern.compile(regxpForTag);
  472. Pattern patternForAttrib = Pattern.compile(regxpForTagAttrib);
  473. Matcher matcherForTag = patternForTag.matcher(str);
  474. StringBuffer sb = new StringBuffer();
  475. boolean result = matcherForTag.find();
  476. while (result) {
  477. StringBuffer sbreplace = new StringBuffer();
  478. Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag
  479. .group(1));
  480. if (matcherForAttrib.find()) {
  481. matcherForAttrib.appendReplacement(sbreplace, startTag
  482. + matcherForAttrib.group(1) + endTag);
  483. }
  484. matcherForTag.appendReplacement(sb, sbreplace.toString());
  485. result = matcherForTag.find();
  486. }
  487. matcherForTag.appendTail(sb);
  488. return sb.toString();
  489. }
  490. /**
  491. * html标签替换为指定字符
  492. * @param str
  493. * @param tagAttrib
  494. * @param beforeTag
  495. * @param replace
  496. * @return
  497. */
  498. public static String replaceHtmlTagOfText(String str,String tag,String text) {
  499. String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>";
  500. Pattern pattern = Pattern.compile(regxp);
  501. Matcher matcher = pattern.matcher(str);
  502. StringBuffer sb = new StringBuffer();
  503. boolean result1 = matcher.find();
  504. while (result1) {
  505. matcher.appendReplacement(sb, text);
  506. result1 = matcher.find();
  507. }
  508. matcher.appendTail(sb);
  509. return sb.toString();
  510. }
  511. /**
  512. * 获取指定HTML标签的指定属性的值
  513. * @param source 要匹配的源文本
  514. * @param element 标签名称
  515. * @param attr 标签的属性名称
  516. * @return 属性值列表
  517. */
  518. public static List<String> match(String source, String element, String attr) {
  519. List<String> result = new ArrayList<String>();
  520. String reg = "<" + element + "[^<>]*?\\s" + attr + "=['\"]?(.*?)['\"]?\\s.*?>";
  521. Matcher m = Pattern.compile(reg).matcher(source);
  522. while (m.find()) {
  523. String r = m.group(1);
  524. result.add(r);
  525. }
  526. return result;
  527. }
  528. public static List<String> getImgHTML(String html) {
  529. List<String> resultList=new ArrayList<String>();
  530. Pattern p=Pattern.compile("<img ([^>]*)");//<img开头 >结尾
  531. Matcher m=p.matcher(html);//开始编译
  532. while (m.find()) {
  533. resultList.add("<img "+m.group(1)+">");//获取匹配的部分
  534. }
  535. return resultList;
  536. }
  537. public static List<String> getImgSrc(String htmlStr){
  538. String img="";
  539. Pattern p_image;
  540. Matcher m_image;
  541. List<String> pics = new ArrayList<String>();
  542. String regEx_img = "<img.*src=(.*?)[^>]*?>"; //图片链接地址
  543. p_image = Pattern.compile
  544. (regEx_img,Pattern.CASE_INSENSITIVE);
  545. m_image = p_image.matcher(htmlStr);
  546. while(m_image.find()){
  547. img = m_image.group();
  548. Matcher m = Pattern.compile("src=\"?(.*?)(\"|>|\\s+)").matcher(img); //匹配src
  549. while(m.find()){
  550. pics.add(m.group(1));
  551. }
  552. }
  553. return pics;
  554. }
  555. public static List<String> getImgAlt(String htmlStr){
  556. String img="";
  557. Pattern p_image;
  558. Matcher m_image;
  559. List<String> alts = new ArrayList<String>();
  560. String regEx_img = "<img.*src=(.*?)[^>]*?>"; //图片链接地址
  561. p_image = Pattern.compile
  562. (regEx_img,Pattern.CASE_INSENSITIVE);
  563. m_image = p_image.matcher(htmlStr);
  564. while(m_image.find()){
  565. img = m_image.group();
  566. Matcher m = Pattern.compile("alt=\"?(.*?)(\"|>|\\s+)").matcher(img); //匹配src
  567. while(m.find()){
  568. alts.add(m.group(1));
  569. }
  570. }
  571. return alts;
  572. }
  573. /**
  574. *
  575. * 基本功能:过滤所有以"<"开头以">"结尾的标签,但是替换为空格
  576. * <p>
  577. *
  578. * @param str
  579. * @return String
  580. */
  581. public static String filterHtmlWithSapce(String str) {
  582. Pattern pattern = Pattern.compile(regxpForHtml);
  583. Matcher matcher = pattern.matcher(str);
  584. StringBuffer sb = new StringBuffer();
  585. boolean result1 = matcher.find();
  586. while (result1) {
  587. matcher.appendReplacement(sb, " ");
  588. result1 = matcher.find();
  589. }
  590. matcher.appendTail(sb);
  591. return sb.toString();
  592. }
  593. }


并且贴出一个jsoup的白名单配置文件:

  1. {
  2. "whiteList":{
  3. "a":{"href":"","title":""},
  4. "b":{},
  5. "blockquote":{"cite":""},
  6. "br":{},
  7. "caption":{},
  8. "cite":{},
  9. "code":{},
  10. "col":{"span":"","width":""},
  11. "colgroup":{"span":"","width":""},
  12. "dd":{},
  13. "div":{},
  14. "dl":{},
  15. "dt":{},
  16. "em":{},
  17. "h1":{},
  18. "h2":{},
  19. "h3":{},
  20. "h4":{},
  21. "h5":{},
  22. "h6":{},
  23. "i":{},
  24. "img":{"align":"", "alt":"", "height":"", "src":"", "title":"", "width":""},
  25. "li":{"class":"","style":"/^text-align:\\s*(left|right|center);?\\s*$/i"},
  26. "ol":{"start":"", "type":""},
  27. "p":{"style":"/^text-align:\\s*(left|right|center);?\\s*$/i"},
  28. "pre":{},
  29. "q":{"cite":""},
  30. "small":{},
  31. "span":{"style":"/^\\s*font-family\\s*:\\s*(('|\\\"|&quot;|&#39;)?(楷体|楷体_GB2312|宋体|微软雅黑|黑体|,|\\s|\\w|sans-serif)('|\\\"|&quot;|&#39;)?)+;?\\s*|\\s*(color|font-size|background-color)\\s*:\\s*(#\\w*|[\\w\\s]*|rgb\\s*\\(\\s*\\d+\\s*,\\s*\\d+\\s*,\\s*\\d+\\s*\\));?\\s*|\\s*text-decoration\\s*:\\s*(underline|overline|line-through|blink)\\s*;?\\s*$/i"},
  32. "strike":{},
  33. "strong":{},
  34. "sub":{},
  35. "sup":{},
  36. "table":{"summary":"", "width":""},
  37. "tbody":{},
  38. "td":{"abbr":"", "axis":"", "colspan":"", "rowspan":"", "width":""},
  39. "tfoot":{},
  40. "th":{"abbr":"", "axis":"", "colspan":"", "rowspan":"", "scope":"","width":""},
  41. "thead":{},
  42. "tr":{},
  43. "u":{},
  44. "ul":{"type":"","class":"","style":"/^list-style-type:\\s*(decimal|disc);\\s*$/i"}
  45. },
  46. "protocols":{
  47. "a.href":["ftp", "http", "https", "mailto"],
  48. "blockquote.cite":["http", "https"],
  49. "cite.cite":["http", "https"],
  50. "img.src":["http", "https"],
  51. "q.cite":["http", "https"]
  52. }
  53. }

即每个标签的任何属性,属性的值我们都可以进行过滤和定制。

这样,用户输入的任何东西都可以得到我们的控制。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/凡人多烦事01/article/detail/338602
推荐阅读
相关标签
  

闽ICP备14008679号