jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jQuery-like methods.
/* 根据Document对象获取元素 */ //1:通过id获取元素:获取博客头标题 Element element = doc.getElementById("header"); System.out.println(element.text());
//2:通过tag获取元素:获取博客文章标题 Elements elements = doc.getElementsByTag("h3"); for (Element e : elements) { System.out.println(e.text()); } //3:通过class获取元素:获取博客文章简介 Elements elements2 = doc.getElementsByClass("post-content"); for (Element e : elements2) { System.out.println(e.text() + "\n"); } //4:通过attribute获取元素:获取博客文章发布时间 Elements elements3 = doc.getElementsByAttribute("datetime"); for (Element e : elements3) { System.out.println(e.text()); } //4-2:通过属性名加属性值筛选元素:获取博客文章标题 Elements elements4 = doc.getElementsByAttributeValue("itemprop", "name"); for (Element e : elements4) { System.out.println(e.text()); }
/* 获取元素中的数据 */ Element e = doc.getElementById("header"); //1:从元素中获取id System.out.println(e.id()); //2:从元素中获取文本内容text System.out.println(e.text()); //3:从元素中获取className System.out.println(e.className()); //4:从元素中获取属性的值attr System.out.println(e.attr("id")); //5:从元素中获取所有属性attributes System.out.println(e.attributes().toString()); // ······