aboutsummaryrefslogtreecommitdiff
path: root/src/lib.rs
blob: 593ad27f5f8d1b8e50d0c2e7a194a24f4dde11e6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#[macro_use]
extern crate serde_derive;
extern crate bincode;
#[macro_use]
extern crate error_chain;
extern crate quick_xml;
extern crate bzip2;

mod page;
mod abstractdoc;
mod xmlutil;
mod errors;
mod revision;

pub use xmlutil::FromXml;
pub use abstractdoc::AbstractDoc;
pub use page::{Page, PageIter};
pub use revision::Revision;

// use bzip2::read::BzDecoder;

#[cfg(test)]
mod tests {
    //! Manual smoke tests that run the parsers over locally stored dump files.
    //!
    //! Every test here opens a hard-coded absolute path and panics if the file
    //! is missing, so each one is marked `#[ignore]` to keep a plain
    //! `cargo test` usable on machines without the data. Run them explicitly
    //! with `cargo test -- --ignored` (add `--nocapture` to see the output).

    use quick_xml::reader::Reader;
    use quick_xml::events::Event;
    use super::*;
    use bincode::{serialize, Infinite};

    /// Walks every page yielded by `PageIter` for the pages-articles dump and
    /// prints each revision's text followed by a `(~END~)` marker line.
    ///
    /// Panics if the dump file cannot be opened.
    #[test]
    #[ignore]
    fn enwp_page() {
        let xml = Reader::from_file("/fast/scratch/enwiki-20170520-pages-articles-multistream.xml").unwrap();

        for page in PageIter::new(xml) {
            // Checks that are currently disabled, kept for reference:
            //if page.namespace == 10 {
            //    println!("{}", page.title);
            //}
            //assert!(page.namespace < 10000);
            println!("{}\n(~END~)", page.revision.text);
        }
    }

    /// Parses every `<doc>` element of the abstract dump into an `AbstractDoc`,
    /// asserts that each one has a non-empty title and URL, then serializes the
    /// collected documents with bincode and prints the encoded size.
    ///
    /// Panics if the file cannot be opened, on any XML parse error, or if a
    /// document fails to convert.
    #[test]
    #[ignore]
    fn enwp_abstract() {
        let mut xml = Reader::from_file("/tmp/enwiki-20170520-abstract.xml").unwrap();
        let mut buf = Vec::new();
        let mut collect = Vec::new();

        loop {
            match xml.read_event(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    // Only `<doc>` start tags are handed to the parser; every
                    // other start tag is skipped.
                    if e.name() == b"doc" {
                        match AbstractDoc::from_xml(&mut xml) {
                            Ok(abs) => {
                                assert!(abs.title != "");
                                assert!(abs.url != "");
                                collect.push(abs);
                                // Checks that are currently disabled, kept for reference:
                                //assert!(abs.abstract_text != "");
                                //assert!(abs.links.len() > 0);
                            },
                            Err(err) => panic!("Abstract XML: {}", err),
                        }
                    }
                },
                Ok(Event::Eof) => break,
                Err(err) => panic!("parse error {}", err),
                _ => (),
            }

            // `read_event` appends to `buf`; the event borrowed from it is no
            // longer needed here, so drop the bytes instead of letting the
            // buffer grow for the whole file.
            buf.clear();
        }

        let encoded: Vec<u8> = serialize(&collect, Infinite).unwrap();
        println!("Encoded size: {}", encoded.len());
    }
}