完全に途中です。完成するかなこれ。。
ServoがWebスクレイピングライブラリ作ってくれてるなんて。
一部、ここを参考にしています。
[dependencies]
html5ever = "*"
tendril = "*"
extern crate tendril; extern crate html5ever; pub mod scrap;
use std::io::{self, Write}; use std::default::Default; use tendril::{ByteTendril, ReadExt}; use html5ever::driver::ParseOpts; use html5ever::tokenizer::Attribute; use html5ever::tree_builder::TreeBuilderOpts; use html5ever::{parse, one_input, serialize}; use html5ever::rcdom::{RcDom, Handle, Element, ElementEnum, NodeEnum}; #[derive(Debug)] pub struct ScrapHandle { handle: Handle, } impl ScrapHandle { pub fn get_elements_by_tag_name(self, tag_name: &str) -> Vec<ScrapNode> { let mut nodes = vec![]; ScrapHandle::_get_elements_by_tag_name(self.handle, tag_name, &mut nodes); nodes } fn _get_elements_by_tag_name(handle: Handle, element_name: &str, out: &mut Vec<ScrapNode>) { let node = handle.borrow(); if let Element(ref name, _, ref attrs) = node.node { if &*name.local == element_name { out.push(ScrapNode { node: Element(name.clone(), ElementEnum::Normal, attrs.clone()), }); } } for child in &node.children { ScrapHandle::_get_elements_by_tag_name(child.clone(), element_name, out); } } } #[derive(Debug)] pub struct ScrapNode { node: NodeEnum, } impl ScrapNode { pub fn attr(self, target_name: &str) -> String { let mut ret = "".to_owned(); if let Element(_, _, ref attrs) = self.node { for attr in attrs.iter() { let Attribute { ref name, ref value } = *attr; if &*name.local == target_name { ret = value.to_string() } } } ret } } pub fn parse_html(text: String) -> ScrapHandle { let mut source = ByteTendril::new(); text.as_bytes().read_to_tendril(&mut source).unwrap(); let source = source.try_reinterpret().unwrap(); let dom: RcDom = parse(one_input(source), Default::default()); ScrapHandle { handle: dom.document } } #[test] fn test_parse_html() { let handle = parse_html("<div>hoge</div>".to_owned()); } #[test] fn test_get_element_by_tag_name() { let handle = parse_html("<div><a href=\"hoge\">hoge</a></div>".to_owned()); // println!("{:?}", handle); let mut nodes = handle.get_elements_by_tag_name("a"); for node in nodes { // println!("{:?}", node); println!("-----> {}", 
node.attr("href")); } }