//! Crate root: declares the external crates and internal modules, and
//! re-exports the public types (`FromXml`, `AbstractDoc`, `Page`, `PageIter`,
//! `Revision`) from their defining modules.

#[macro_use]
extern crate serde_derive;
extern crate bincode;
#[macro_use]
extern crate error_chain;
extern crate quick_xml;
extern crate bzip2;

mod page;
mod abstractdoc;
mod xmlutil;
mod errors;
mod revision;

pub use xmlutil::FromXml;
pub use abstractdoc::AbstractDoc;
pub use page::{Page, PageIter};
pub use revision::Revision;

// use bzip2::read::BzDecoder;

#[cfg(test)]
mod tests {
    use quick_xml::reader::Reader;
    use quick_xml::events::Event;
    use super::*;
    use bincode::{serialize, Infinite};
    use std::str;

    /// Iterates over every page yielded by `PageIter` for a local dump file
    /// and prints the text of each page's revision.
    ///
    /// NOTE(review): this test reads a hard-coded absolute path and is not
    /// marked `#[ignore]` (unlike `enwp_abstract`), so it panics on the
    /// `unwrap()` wherever that file is absent — confirm whether that is
    /// intended.
    #[test]
    fn enwp_page() {
        let xml = Reader::from_file(
            "/fast/scratch/enwiki-20170520-pages-articles-multistream.xml",
        ).unwrap();
        let pages = PageIter::new(xml);
        for page in pages {
            //if page.namespace == 10 {
            //    println!("{}", page.title);
            //}
            //assert!(page.namespace < 10000);
            println!("{}\n(~END~)", page.revision.text);
        }
    }

    /// Parses every `<doc>` element of a local abstract dump into an
    /// `AbstractDoc`, checks that each has a non-empty title and URL, then
    /// serializes the collected documents with bincode and prints the encoded
    /// size in bytes.
    ///
    /// Ignored by default because it depends on a file at a hard-coded path.
    #[test]
    #[ignore]
    fn enwp_abstract() {
        let mut xml = Reader::from_file("/tmp/enwiki-20170520-abstract.xml").unwrap();
        let mut buf = Vec::new();
        let mut collect = Vec::new();
        loop {
            match xml.read_event(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    match e.name() {
                        b"doc" => {
                            match AbstractDoc::from_xml(&mut xml) {
                                Ok(abs) => {
                                    //println!("{:#?}", abs);
                                    assert!(abs.title != "");
                                    assert!(abs.url != "");
                                    collect.push(abs);
                                    //assert!(abs.abstract_text != "");
                                    //assert!(abs.links.len() > 0);
                                },
                                Err(err) => panic!("Abstract XML: {}", err),
                            }
                        },
                        _ => (),
                    }
                },
                Ok(Event::Eof) => break,
                Err(err) => panic!("parse error {}", err),
                _ => (),
            }
            // No event is retained past this point, so drop the bytes read
            // for it; otherwise the buffer keeps growing across the whole dump.
            buf.clear();
        }
        // bincode's `serialize` yields the encoded bytes, hence `Vec<u8>`.
        let encoded: Vec<u8> = serialize(&collect, Infinite).unwrap();
        println!("Encoded size: {}", encoded.len());
    }
}