1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
|
#[macro_use]
extern crate serde_derive;
extern crate bincode;
#[macro_use]
extern crate error_chain;
extern crate quick_xml;
extern crate bzip2;
mod page;
mod abstractdoc;
mod xmlutil;
mod errors;
mod revision;
pub use xmlutil::FromXml;
pub use abstractdoc::AbstractDoc;
pub use page::{Page, PageIter};
pub use revision::Revision;
// use bzip2::read::BzDecoder;
#[cfg(test)]
mod tests {
    //! Smoke tests that run the parsers over English Wikipedia dump files.
    //!
    //! Both tests read from hard-coded absolute paths on the local machine, so
    //! they are marked `#[ignore]` and only run when requested explicitly with
    //! `cargo test -- --ignored`.

    use super::*;
    use bincode::{serialize, Infinite};
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;

    /// Iterates over every page produced by `PageIter` for the pages-articles
    /// dump and prints each page's revision text followed by a `(~END~)` line.
    ///
    /// # Panics
    ///
    /// Panics if the dump file cannot be opened.
    #[test]
    #[ignore]
    fn enwp_page() {
        let xml = Reader::from_file("/fast/scratch/enwiki-20170520-pages-articles-multistream.xml")
            .expect("failed to open the pages-articles dump");
        for page in PageIter::new(xml) {
            println!("{}\n(~END~)", page.revision.text);
        }
    }

    /// Parses every `<doc>` element of the abstract dump with
    /// `AbstractDoc::from_xml`, checks that each parsed document has a
    /// non-empty title and URL, then serializes the collected documents with
    /// bincode and prints the encoded size.
    ///
    /// # Panics
    ///
    /// Panics if the dump file cannot be opened, on any XML read error, if a
    /// `<doc>` element fails to parse, or if serialization fails.
    #[test]
    #[ignore]
    fn enwp_abstract() {
        let mut xml = Reader::from_file("/tmp/enwiki-20170520-abstract.xml")
            .expect("failed to open the abstract dump");
        let mut buf = Vec::new();
        let mut collect = Vec::new();
        loop {
            match xml.read_event(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    // Only `<doc>` start tags are handed to the parser; every
                    // other element is skipped.
                    if e.name() == b"doc" {
                        match AbstractDoc::from_xml(&mut xml) {
                            Ok(abs) => {
                                assert!(abs.title != "");
                                assert!(abs.url != "");
                                // NOTE(review): checks on `abstract_text` and
                                // `links` were disabled by the original author,
                                // presumably because some documents have
                                // neither; confirm before re-enabling.
                                collect.push(abs);
                            }
                            Err(err) => panic!("Abstract XML: {}", err),
                        }
                    }
                }
                Ok(Event::Eof) => break,
                Err(err) => panic!("parse error {}", err),
                _ => (),
            }
            // The event borrowed from `buf` is no longer in use here; clearing
            // keeps the buffer from growing with every event that is read.
            buf.clear();
        }
        let encoded: Vec<u8> = serialize(&collect, Infinite).unwrap();
        println!("Encoded size: {}", encoded.len());
    }
}
|