aboutsummaryrefslogtreecommitdiff
path: root/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib.rs')
-rw-r--r-- src/lib.rs 77
1 files changed, 77 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..c836159
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,77 @@
+#[macro_use]
+extern crate serde_derive;
+extern crate bincode;
+#[macro_use]
+extern crate error_chain;
+extern crate quick_xml;
+extern crate bzip2;
+
+mod page;
+mod abstractdoc;
+mod xmlutil;
+mod errors;
+mod revision;
+
+pub use xmlutil::FromXml;
+pub use abstractdoc::AbstractDoc;
+pub use page::{Page, PageIter};
+pub use revision::Revision;
+
+// use bzip2::read::BzDecoder;
+
+#[cfg(test)]
+mod tests {
+    // Tests that parse complete Wikipedia dump files.
+    //
+    // Both tests read large dumps from hard-coded, machine-specific absolute
+    // paths, so both are marked `#[ignore]` and only run when requested
+    // explicitly (e.g. `cargo test -- --ignored`).
+
+    use quick_xml::reader::Reader;
+    use quick_xml::events::Event;
+    use super::*;
+    use bincode::{serialize, Infinite};
+
+    /// Iterates over every page produced by `PageIter` for the pages-articles
+    /// dump and checks that each page's `namespace` is below 10000.
+    ///
+    /// Ignored by default: the dump path only exists on the author's machine,
+    /// and opening it is unwrapped, so the test would panic everywhere else.
+    #[test]
+    #[ignore]
+    fn enwp_page() {
+        let xml = Reader::from_file("/fast/scratch/enwiki-20170520-pages-articles-multistream.xml")
+            .expect("pages-articles dump must be present to run this test");
+        let pages = PageIter::new(xml);
+
+        for page in pages {
+            assert!(page.namespace < 10000);
+            //println!("{:#?}", page);
+        }
+    }
+
+    /// Reads the abstract dump event by event, builds an `AbstractDoc` for
+    /// every `<doc>` start tag, asserts that `title` and `url` are non-empty,
+    /// then serializes the collected documents with bincode and prints the
+    /// encoded size.
+    ///
+    /// Panics on any XML read error or `AbstractDoc::from_xml` error.
+    #[test]
+    #[ignore]
+    fn enwp_abstract() {
+        let mut xml = Reader::from_file("/tmp/enwiki-20170520-abstract.xml").unwrap();
+        let mut buf = Vec::new();
+        let mut collect = Vec::new();
+
+        loop {
+            match xml.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => {
+                    match e.name() {
+                        b"doc" => {
+                            match AbstractDoc::from_xml(&mut xml) {
+                                Ok(abs) => {
+                                    //println!("{:#?}", abs);
+                                    assert!(abs.title != "");
+                                    assert!(abs.url != "");
+                                    collect.push(abs);
+                                    //assert!(abs.abstract_text != "");
+                                    //assert!(abs.links.len() > 0);
+                                },
+                                Err(err) => panic!("Abstract XML: {}", err),
+                            }
+                        },
+                        _ => (),
+                    }
+                },
+                Ok(Event::Eof) => break,
+                Err(err) => panic!("parse error {}", err),
+                _ => (),
+            }
+
+            // `read_event` appends each event's bytes to `buf`; the event has
+            // been fully handled by now, so drop them to keep the buffer from
+            // growing with the size of the whole dump.
+            buf.clear();
+        }
+
+        let encoded: Vec<u8> = serialize(&collect, Infinite).unwrap();
+        println!("Encoded size: {}", encoded.len());
+    }
+}