diff options
Diffstat (limited to 'src/lib.rs')
-rw-r--r-- | src/lib.rs | 77 |
1 files changed, 77 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..c836159 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,77 @@ +#[macro_use] +extern crate serde_derive; +extern crate bincode; +#[macro_use] +extern crate error_chain; +extern crate quick_xml; +extern crate bzip2; + +mod page; +mod abstractdoc; +mod xmlutil; +mod errors; +mod revision; + +pub use xmlutil::FromXml; +pub use abstractdoc::AbstractDoc; +pub use page::{Page, PageIter}; +pub use revision::Revision; + +// use bzip2::read::BzDecoder; + +#[cfg(test)] +mod tests { + use quick_xml::reader::Reader; + use quick_xml::events::Event; + use super::*; + use bincode::{serialize, Infinite}; + use std::str; + + #[test] + fn enwp_page() { + let xml = Reader::from_file("/fast/scratch/enwiki-20170520-pages-articles-multistream.xml").unwrap(); + let pages = PageIter::new(xml); + + for page in pages { + assert!(page.namespace < 10000); + //println!("{:#?}", page); + } + } + + #[test] + #[ignore] + fn enwp_abstract() { + let mut xml = Reader::from_file("/tmp/enwiki-20170520-abstract.xml").unwrap(); + let mut buf = Vec::new(); + let mut collect = Vec::new(); + + loop { + match xml.read_event(&mut buf) { + Ok(Event::Start(ref e)) => { + match e.name() { + b"doc" => { + match AbstractDoc::from_xml(&mut xml) { + Ok(abs) => { + //println!("{:#?}", abs); + assert!(abs.title != ""); + assert!(abs.url != ""); + collect.push(abs); + //assert!(abs.abstract_text != ""); + //assert!(abs.links.len() > 0); + }, + Err(err) => panic!("Abstract XML: {}", err), + } + }, + _ => (), + } + }, + Ok(Event::Eof) => break, + Err(err) => panic!("parse error {}", err), + _ => (), + } + } + + let encoded: Vec<u8> = serialize(&collect, Infinite).unwrap(); + println!("Encoded size: {}", encoded.len()); + } +} |