# Extracting Links

## Extract all links from a webpage HTML

Use `reqwest::get` to perform an HTTP GET request, then use `Document::from_read` to parse the response into an HTML document. Calling `find` with the `Name` predicate "a" retrieves all links. Call `filter_map` on the `Selection` to keep only the URLs of links that carry an "href" `attr` (attribute).

```rust,no_run
# #[macro_use]
# extern crate error_chain;
extern crate reqwest;
extern crate select;

use select::document::Document;
use select::predicate::Name;
#
# error_chain! {
#     foreign_links {
#         ReqError(reqwest::Error);
#         IoError(std::io::Error);
#     }
# }

fn run() -> Result<()> {
    let res = reqwest::get("https://www.rust-lang.org/en-US/")?;

    Document::from_read(res)?
        .find(Name("a"))
        .filter_map(|n| n.attr("href"))
        .for_each(|x| println!("{}", x));

    Ok(())
}
#
# quick_main!(run);
```
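
To see what the combinators do without making a network request, here is a minimal sketch (the HTML literal is made up, not part of the recipe) that parses a string instead of an HTTP response:

```rust
extern crate select;

use select::document::Document;
use select::predicate::Name;

fn main() {
    // Hypothetical HTML: a relative link, an absolute link, and an anchor without href.
    let html = r#"<a href="/foo">foo</a> <a href="https://example.com">bar</a> <a>baz</a>"#;
    let document = Document::from(html);

    // Anchors without an "href" attribute are dropped by filter_map.
    let links: Vec<&str> = document
        .find(Name("a"))
        .filter_map(|n| n.attr("href"))
        .collect();

    assert_eq!(links, ["/foo", "https://example.com"]);
}
```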

## Check a webpage for broken links

Call `get_base_url` to retrieve the base URL. If the document has a `base` tag, get the href `attr` from it; otherwise, `Position::BeforePath` of the original URL acts as the default.
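
What that default actually yields is easier to see in isolation. A minimal sketch (with a made-up URL) of slicing a `Url` up to `Position::BeforePath`, which keeps only the scheme, host, and port:

```rust
extern crate url;

use url::{Url, Position};

fn main() {
    let url = Url::parse("https://www.rust-lang.org/en-US/documentation.html").unwrap();

    // Everything before the path: scheme + host (+ port, if any).
    assert_eq!(&url[..Position::BeforePath], "https://www.rust-lang.org");
}
```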

Iterate through the links in the document, and parse them with `url::ParseOptions` and `Url::parse`. Make a request to each link with reqwest and verify the `StatusCode`.

```rust,no_run
# #[macro_use]
# extern crate error_chain;
extern crate reqwest;
extern crate select;
extern crate url;

use std::collections::HashSet;
use url::{Url, Position};
use reqwest::StatusCode;
use select::document::Document;
use select::predicate::Name;
#
# error_chain! {
#     foreign_links {
#         ReqError(reqwest::Error);
#         IoError(std::io::Error);
#         UrlParseError(url::ParseError);
#     }
# }

fn get_base_url(url: &Url, doc: &Document) -> Result<Url> {
    // Prefer an explicit <base href="..."> tag; fall back to the URL's own origin.
    let base_tag_href = doc.find(Name("base")).filter_map(|n| n.attr("href")).nth(0);

    let base_url = base_tag_href.map_or_else(
        || Url::parse(&url[..Position::BeforePath]),
        Url::parse,
    )?;

    Ok(base_url)
}

fn check_link(url: &Url) -> Result<bool> {
    let res = reqwest::get(url.as_ref())?;

    Ok(res.status() != StatusCode::NOT_FOUND)
}

fn run() -> Result<()> {
    let url = Url::parse("https://www.rust-lang.org/en-US/")?;

    let res = reqwest::get(url.as_ref())?;
    let document = Document::from_read(res)?;

    let base_url = get_base_url(&url, &document)?;

    // Resolve relative hrefs against the base URL while parsing.
    let base_parser = Url::options().base_url(Some(&base_url));

    let links: HashSet<Url> = document
        .find(Name("a"))
        .filter_map(|n| n.attr("href"))
        .filter_map(|link| base_parser.parse(link).ok())
        .collect();

    links
        .iter()
        .filter(|link| check_link(link).ok() == Some(false))
        .for_each(|x| println!("{} is broken.", x));

    Ok(())
}
#
# quick_main!(run);
```
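
The resolution step is easy to verify on its own. A minimal sketch (with made-up URLs) of `Url::options` with a base URL, showing that relative hrefs resolve against it while absolute ones pass through unchanged:

```rust
extern crate url;

use url::Url;

fn main() {
    let base = Url::parse("https://www.rust-lang.org/en-US/").unwrap();
    let parser = Url::options().base_url(Some(&base));

    // A relative href is resolved against the base URL.
    assert_eq!(
        parser.parse("documentation.html").unwrap().as_str(),
        "https://www.rust-lang.org/en-US/documentation.html"
    );

    // An absolute href is parsed as-is.
    assert_eq!(
        parser.parse("https://example.com/x").unwrap().as_str(),
        "https://example.com/x"
    );
}
```

Note that `check_link` only reports a link as broken on an explicit `404 Not Found`; a request that fails outright (e.g. a DNS error) returns `Err`, and the `check_link(link).ok() == Some(false)` filter skips such links rather than reporting them.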

## Extract all unique links from a MediaWiki markup

Pull the source of a MediaWiki page using `reqwest::get`, then look for all entries of internal and external links with `Regex::captures_iter`. Using `Cow` avoids excessive `String` allocations.
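
A minimal sketch (with made-up link values) of why `Cow` helps: external links are kept verbatim and can borrow from the source text, while internal links are lowercased, which necessarily allocates an owned `String`:

```rust
use std::borrow::Cow;

fn describe(link: &Cow<str>) -> &'static str {
    match *link {
        Cow::Borrowed(_) => "borrowed (no allocation)",
        Cow::Owned(_) => "owned (allocated)",
    }
}

fn main() {
    // Kept as-is, so it can borrow from the original markup.
    let external: Cow<str> = Cow::from("http://example.com/");

    // Lowercasing produces a fresh String that the Cow must own.
    let internal: Cow<str> = Cow::from("Rust (Programming Language)".to_lowercase());

    println!("external is {}", describe(&external)); // borrowed (no allocation)
    println!("internal is {}", describe(&internal)); // owned (allocated)
}
```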

MediaWiki link syntax is described [here](https://www.mediawiki.org/wiki/Help:Links).

```rust,no_run
# #[macro_use]
# extern crate error_chain;
#[macro_use]
extern crate lazy_static;
extern crate reqwest;
extern crate regex;

use std::io::Read;
use std::collections::HashSet;
use std::borrow::Cow;
use regex::Regex;
#
# error_chain! {
#     foreign_links {
#         Io(std::io::Error);
#         Reqwest(reqwest::Error);
#         Regex(regex::Error);
#     }
# }
#
fn extract_links(content: &str) -> Result<HashSet<Cow<str>>> {
    lazy_static! {
        static ref WIKI_REGEX: Regex =
            Regex::new(r"(?x)
                \[\[(?P<internal>[^\[\]|]*)[^\[\]]*\]\]    # internal links
                |
                (url=|URL\||\[)(?P<external>http.*?)[ \|}] # external links
            ").unwrap();
    }

    let links: HashSet<_> = WIKI_REGEX
        .captures_iter(content)
        .map(|c| match (c.name("internal"), c.name("external")) {
            (Some(val), None) => Cow::from(val.as_str().to_lowercase()),
            (None, Some(val)) => Cow::from(val.as_str()),
            _ => unreachable!(),
        })
        .collect();

    Ok(links)
}

fn run() -> Result<()> {
    let mut content = String::new();
    reqwest::get(
        "https://en.wikipedia.org/w/index.php?title=Rust_(programming_language)&action=raw",
    )?
        .read_to_string(&mut content)?;

    println!("{:#?}", extract_links(&content)?);

    Ok(())
}
#
# quick_main!(run);
```
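
Two design choices worth noting: the `(?x)` flag puts the regex into insignificant-whitespace mode, which allows the pattern to be split across lines and annotated with `#` comments, and `lazy_static!` compiles the regex once on first use instead of on every call to `extract_links`.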