Rust で Webスクレイピング(途中)

完全に途中です。完成するかなこれ。。
ServoがWebスクレイピングに使えるHTMLパーサライブラリ(html5ever)を作ってくれているなんて。

一部、ここ を参考にしています。

[dependencies]
html5ever = "*"
tendril = "*"
extern crate tendril;
extern crate html5ever;
pub mod scrap;
use std::io::{self, Write};
use std::default::Default;
use tendril::{ByteTendril, ReadExt};
use html5ever::driver::ParseOpts;
use html5ever::tokenizer::Attribute;
use html5ever::tree_builder::TreeBuilderOpts;
use html5ever::{parse, one_input, serialize};
use html5ever::rcdom::{RcDom, Handle, Element, ElementEnum, NodeEnum};
/// Wrapper around an `html5ever` DOM handle that exposes simple element lookups.
#[derive(Debug)]
pub struct ScrapHandle {
    handle: Handle,
}

impl ScrapHandle {
    /// Collects every element whose local tag name equals `tag_name`.
    ///
    /// The tree rooted at this handle is walked depth-first; a node is tested
    /// before its children, and children are visited in stored order. Each match
    /// is returned as a detached [`ScrapNode`] carrying a copy of the element's
    /// name and attributes (its children are not carried over).
    pub fn get_elements_by_tag_name(self, tag_name: &str) -> Vec<ScrapNode> {
        let mut matches = Vec::new();
        ScrapHandle::collect_by_tag_name(&self.handle, tag_name, &mut matches);
        matches
    }

    /// Recursive worker: appends the matches found under `current` to `found`.
    fn collect_by_tag_name(current: &Handle, wanted: &str, found: &mut Vec<ScrapNode>) {
        let borrowed = current.borrow();

        match borrowed.node {
            Element(ref qual_name, _, ref attributes) if &*qual_name.local == wanted => {
                // The copy is always tagged `ElementEnum::Normal`, whatever the
                // source element's own kind was.
                found.push(ScrapNode {
                    node: Element(qual_name.clone(), ElementEnum::Normal, attributes.clone()),
                });
            }
            _ => {}
        }

        for child in borrowed.children.iter() {
            ScrapHandle::collect_by_tag_name(child, wanted, found);
        }
    }
}
/// A single DOM node extracted from a parsed document.
#[derive(Debug)]
pub struct ScrapNode {
    node: NodeEnum,
}

impl ScrapNode {
    /// Returns the value of the attribute whose local name is `target_name`.
    ///
    /// Yields an empty string when the node is not an element or when no
    /// attribute matches. If several attributes share the name, the last one in
    /// the attribute list wins.
    pub fn attr(self, target_name: &str) -> String {
        let attributes = match self.node {
            Element(_, _, ref attributes) => attributes,
            _ => return String::new(),
        };

        // Searching from the back keeps "last match wins" semantics.
        attributes
            .iter()
            .rev()
            .find(|candidate| &*candidate.name.local == target_name)
            .map(|candidate| candidate.value.to_string())
            .unwrap_or_else(String::new)
    }
}
/// Parses `text` as an HTML document and returns a handle to the document root.
///
/// The input is copied into a byte tendril, reinterpreted as UTF-8, and fed to
/// the `html5ever` parser with default options.
///
/// # Panics
///
/// Panics if copying the bytes into the tendril fails or if the bytes are not
/// accepted as UTF-8. Neither is expected for a `String` input: reading from an
/// in-memory byte slice does not produce I/O errors, and a `String` is always
/// valid UTF-8.
pub fn parse_html(text: String) -> ScrapHandle {
    let mut raw = ByteTendril::new();
    text.as_bytes()
        .read_to_tendril(&mut raw)
        .expect("reading from an in-memory byte slice cannot fail");

    let source = raw
        .try_reinterpret()
        .expect("bytes taken from a String are always valid UTF-8");

    let dom: RcDom = parse(one_input(source), Default::default());
    ScrapHandle { handle: dom.document }
}
/// Parsing a minimal fragment must succeed and produce a tree that contains
/// exactly one `<div>` element.
#[test]
fn test_parse_html() {
    let handle = parse_html("<div>hoge</div>".to_owned());
    let divs = handle.get_elements_by_tag_name("div");
    assert_eq!(divs.len(), 1);
}
/// Looking up `<a>` elements must find the single anchor, and its `href`
/// attribute must be readable through `ScrapNode::attr`.
#[test]
fn test_get_element_by_tag_name() {
    let handle = parse_html("<div><a href=\"hoge\">hoge</a></div>".to_owned());
    let nodes = handle.get_elements_by_tag_name("a");

    // `attr` consumes the node, so move each node out of the vector.
    let hrefs: Vec<String> = nodes.into_iter().map(|node| node.attr("href")).collect();
    assert_eq!(hrefs, vec!["hoge".to_owned()]);
}