From 30bd9159921998288c623b0e7e357830c5d62bfb Mon Sep 17 00:00:00 2001
From: Nick Shipp
Date: Sat, 27 May 2017 23:27:57 -0400
Subject: WIP: Wrote partial lifter from mediawiki dumps

---
Review notes (this section is not part of the commit message):
 - Type parameters on Reader/Result/Vec/Option/PageIter are spelled out.
 - Every from_xml loop now fails with ErrorKind::EOF at end of input
   instead of looping on it, and clears its event buffer per iteration.
 - element_text accepts an immediately closed element and passes reader
   errors through unchanged.
 - Page reads its namespace from <ns> as well as <namespace>.
 - Both dump-reading tests are #[ignore]d; Error/ErrorKind are re-exported.
 - The blob ids on the "index" lines are those of the original patch.

 .gitignore         |   5 +++
 Cargo.toml         |  12 ++++++
 src/abstractdoc.rs |  84 ++++++++++++++++++++++++++++++++++++++
 src/errors.rs      |  28 +++++++++++++
 src/lib.rs         |  83 +++++++++++++++++++++++++++++++++++++
 src/page.rs        | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/revision.rs    | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/xmlutil.rs     |  35 ++++++++++++++++
 8 files changed, 478 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.toml
 create mode 100644 src/abstractdoc.rs
 create mode 100644 src/errors.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/page.rs
 create mode 100644 src/revision.rs
 create mode 100644 src/xmlutil.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..92aeac1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+target/
+**/*.rs.bk
+Cargo.lock
+.*.un~
+.*.sw*
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..c19db61
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "wikiparse"
+version = "0.1.0"
+authors = ["Nick Shipp "]
+
+[dependencies]
+bzip2 = "^0.3"
+quick-xml = "^0.7"
+error-chain = "^0.10"
+serde = "^1.0"
+serde_derive = "^1.0"
+bincode = "^0.8"
diff --git a/src/abstractdoc.rs b/src/abstractdoc.rs
new file mode 100644
index 0000000..d45b4fc
--- /dev/null
+++ b/src/abstractdoc.rs
@@ -0,0 +1,84 @@
+use std::io::BufRead;
+
+use quick_xml::events::Event;
+use quick_xml::reader::Reader;
+
+use errors::{Error, ErrorKind};
+use xmlutil::{FromXml, element_text};
+
+/// A `<sublink>` entry of an abstract: anchor text plus link target.
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
+pub struct AbstractLink {
+    pub anchor: String,
+    pub link: String,
+}
+
+impl FromXml for AbstractLink {
+    /// Reads the children of an already-opened link element.
+    ///
+    /// Returns at the first closing tag not consumed by `element_text`
+    /// and fails with `ErrorKind::EOF` if the input ends before that.
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self, Error> {
+        let mut link = AbstractLink::default();
+        let mut buf = Vec::new();
+
+        loop {
+            match reader.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => {
+                    match e.name() {
+                        b"anchor" => link.anchor = element_text(reader)?,
+                        b"link" => link.link = element_text(reader)?,
+                        _ => (),
+                    }
+                },
+                Ok(Event::End(_)) => return Ok(link),
+                // End of input arrives as an event, which the catch-all
+                // arm below would otherwise swallow and loop on.
+                Ok(Event::Eof) => bail!(ErrorKind::EOF),
+                Err(err) => bail!(err),
+                _ => (),
+            }
+            // Events borrow from `buf`; drop the bytes already handled.
+            buf.clear();
+        }
+    }
+}
+
+/// One `<doc>` record of the abstract dump.
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
+pub struct AbstractDoc {
+    pub title: String,
+    pub url: String,
+    pub abstract_text: String,
+    pub links: Vec<AbstractLink>,
+}
+
+impl FromXml for AbstractDoc {
+    /// Reads the children of an already-opened `<doc>` element.
+    ///
+    /// Returns at the first closing tag not consumed by a child parser
+    /// and fails with `ErrorKind::EOF` if the input ends before that.
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self, Error> {
+        let mut abs = AbstractDoc::default();
+        let mut buf = Vec::new();
+
+        loop {
+            match reader.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => {
+                    match e.name() {
+                        b"title" => abs.title = element_text(reader)?,
+                        b"url" => abs.url = element_text(reader)?,
+                        b"abstract" => abs.abstract_text = element_text(reader)?,
+                        b"sublink" => abs.links.push(AbstractLink::from_xml(reader)?),
+                        _ => (),
+                    }
+                },
+                Ok(Event::End(_)) => return Ok(abs),
+                Ok(Event::Eof) => bail!(ErrorKind::EOF),
+                Err(err) => bail!(err),
+                _ => (),
+            }
+            buf.clear();
+        }
+    }
+}
diff --git a/src/errors.rs b/src/errors.rs
new file mode 100644
index 0000000..2704bd9
--- /dev/null
+++ b/src/errors.rs
@@ -0,0 +1,28 @@
+use quick_xml;
+
+error_chain! {
+    foreign_links {
+        Io(::std::io::Error);
+        XML(quick_xml::errors::Error);
+        NumParse(::std::num::ParseIntError);
+        Utf8(::std::str::Utf8Error);
+    }
+    errors {
+        FromXml(pos: usize, tag: String) {
+            description("invalid input while processing tag")
+            display("error at pos {}: invalid <{}>", pos, tag)
+        }
+        ElementText(pos: usize) {
+            description("found something other than text")
+            display("error at pos {}: not text", pos)
+        }
+        ClosingTag(pos: usize) {
+            description("found something other than a closing tag")
+            display("error at pos {}: expected a closing tag", pos)
+        }
+        EOF {
+            description("premature EOF")
+            display("expected more things")
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..c836159
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,83 @@
+#[macro_use]
+extern crate serde_derive;
+extern crate bincode;
+#[macro_use]
+extern crate error_chain;
+extern crate quick_xml;
+extern crate bzip2;
+
+mod page;
+mod abstractdoc;
+mod xmlutil;
+mod errors;
+mod revision;
+
+pub use xmlutil::FromXml;
+pub use abstractdoc::AbstractDoc;
+pub use page::{Page, PageIter};
+pub use revision::Revision;
+// Lets callers name the error type returned by `FromXml::from_xml`.
+pub use errors::{Error, ErrorKind};
+
+// use bzip2::read::BzDecoder;
+
+#[cfg(test)]
+mod tests {
+    use quick_xml::reader::Reader;
+    use quick_xml::events::Event;
+    use super::*;
+    use bincode::{serialize, Infinite};
+    use std::str;
+
+    // Both tests read dump files from machine-specific absolute paths,
+    // so they are opt-in (`cargo test -- --ignored`).
+    #[test]
+    #[ignore]
+    fn enwp_page() {
+        let xml = Reader::from_file("/fast/scratch/enwiki-20170520-pages-articles-multistream.xml").unwrap();
+        let pages = PageIter::new(xml);
+
+        for page in pages {
+            assert!(page.namespace < 10000);
+            //println!("{:#?}", page);
+        }
+    }
+
+    #[test]
+    #[ignore]
+    fn enwp_abstract() {
+        let mut xml = Reader::from_file("/tmp/enwiki-20170520-abstract.xml").unwrap();
+        let mut buf = Vec::new();
+        let mut collect = Vec::new();
+
+        loop {
+            match xml.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => {
+                    match e.name() {
+                        b"doc" => {
+                            match AbstractDoc::from_xml(&mut xml) {
+                                Ok(abs) => {
+                                    //println!("{:#?}", abs);
+                                    assert!(abs.title != "");
+                                    assert!(abs.url != "");
+                                    collect.push(abs);
+                                    //assert!(abs.abstract_text != "");
+                                    //assert!(abs.links.len() > 0);
+                                },
+                                Err(err) => panic!("Abstract XML: {}", err),
+                            }
+                        },
+                        _ => (),
+                    }
+                },
+                Ok(Event::Eof) => break,
+                Err(err) => panic!("parse error {}", err),
+                _ => (),
+            }
+            buf.clear();
+        }
+
+        let encoded: Vec<u8> = serialize(&collect, Infinite).unwrap();
+        println!("Encoded size: {}", encoded.len());
+    }
+}
diff --git a/src/page.rs b/src/page.rs
new file mode 100644
index 0000000..b864640
--- /dev/null
+++ b/src/page.rs
@@ -0,0 +1,117 @@
+use std::io::BufRead;
+use std::str;
+
+use quick_xml::reader::Reader;
+use quick_xml::events::Event;
+
+use errors::{ErrorKind, Result};
+use xmlutil::{FromXml, element_text};
+use revision::Revision;
+
+/// A `<page>` element of a MediaWiki pages dump.
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Page {
+    /// Namespace number, read from `<ns>` (or `<namespace>`).
+    pub namespace: i32,
+    pub title: String,
+    /// `title` attribute of the page's `<redirect/>` child, if any.
+    pub redirect: Option<String>,
+    /// The last `<revision>` child seen.
+    pub revision: Revision,
+}
+
+impl FromXml for Page {
+    /// Reads the children of an already-opened `<page>` up to `</page>`.
+    ///
+    /// Fails with `ErrorKind::EOF` if the input ends before `</page>`.
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self> {
+        let mut page = Page::default();
+        let mut buf = Vec::new();
+
+        loop {
+            match reader.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => {
+                    match e.name() {
+                        b"title" => page.title = element_text(reader)?,
+                        // The MediaWiki export schema names this element
+                        // `<ns>`; `<namespace>` is accepted as before.
+                        b"ns" | b"namespace" => {
+                            page.namespace = element_text(reader)?.parse()?;
+                        },
+                        b"revision" => page.revision = Revision::from_xml(reader)?,
+                        _ => (),
+                    }
+                },
+                Ok(Event::Empty(e)) => {
+                    if e.name() == b"redirect" {
+                        for attr in e.attributes().with_checks(false) {
+                            if let Ok(attr) = attr {
+                                if attr.key == b"title" {
+                                    page.redirect = Some(str::from_utf8(attr.value)?.into());
+                                }
+                            }
+                        }
+                    }
+                },
+                Ok(Event::End(e)) => {
+                    if e.name() == b"page" {
+                        return Ok(page)
+                    }
+                },
+                // End of input arrives as an event, which the catch-all
+                // arm below would otherwise swallow and loop on.
+                Ok(Event::Eof) => bail!(ErrorKind::EOF),
+                Err(err) => bail!(err),
+                _ => (),
+            }
+            buf.clear();
+        }
+    }
+}
+
+/// Iterator over the `<page>` elements found by a reader.
+///
+/// The first parse error is printed and ends the iteration.
+pub struct PageIter<B: BufRead> {
+    reader: Reader<B>,
+    buf: Vec<u8>,
+}
+
+impl<B: BufRead> Iterator for PageIter<B> {
+    type Item = Page;
+
+    fn next(&mut self) -> Option<Page> {
+        loop {
+            // Cleared up front because several arms return early.
+            self.buf.clear();
+            match self.reader.read_event(&mut self.buf) {
+                Ok(Event::Start(ref e)) => {
+                    if e.name() == b"page" {
+                        match Page::from_xml(&mut self.reader) {
+                            Ok(page) => return Some(page),
+                            Err(err) => {
+                                println!("parse error: {}", err);
+                                return None
+                            }
+                        }
+                    }
+                },
+                Ok(Event::Eof) => return None,
+                Err(err) => {
+                    println!("parse error: {}", err);
+                    return None
+                }
+                _ => (),
+            }
+        }
+    }
+}
+
+impl<B: BufRead> PageIter<B> {
+    pub fn new(reader: Reader<B>) -> Self {
+        PageIter {
+            reader,
+            buf: Vec::new(),
+        }
+    }
+}
diff --git a/src/revision.rs b/src/revision.rs
new file mode 100644
index 0000000..5d47f1a
--- /dev/null
+++ b/src/revision.rs
@@ -0,0 +1,114 @@
+use std::io::BufRead;
+
+use quick_xml::events::Event;
+use quick_xml::reader::Reader;
+
+use errors::{ErrorKind, Result};
+use xmlutil::{FromXml, element_text};
+
+/// Content model named by a revision's `<model>` element.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum RevisionModel {
+    Css,
+    Javascript,
+    Json,
+    MassMessageListContent,
+    Scribunto,
+    Wikitext,
+    Unknown(String),
+}
+
+impl Default for RevisionModel {
+    fn default() -> RevisionModel { RevisionModel::Unknown(String::from("?")) }
+}
+
+impl From<String> for RevisionModel {
+    /// Maps a `<model>` value to its variant; unrecognised names are kept
+    /// verbatim in `Unknown`.
+    fn from(name: String) -> RevisionModel {
+        // The borrow for the lookup ends with this statement, so `name`
+        // can be moved into `Unknown` afterwards.
+        let known = match name.as_str() {
+            "css" => Some(RevisionModel::Css),
+            "javascript" => Some(RevisionModel::Javascript),
+            "json" => Some(RevisionModel::Json),
+            "MassMessageListContent" => Some(RevisionModel::MassMessageListContent),
+            "Scribunto" => Some(RevisionModel::Scribunto),
+            "wikitext" => Some(RevisionModel::Wikitext),
+            _ => None,
+        };
+        known.unwrap_or_else(|| RevisionModel::Unknown(name))
+    }
+}
+
+/// Serialization format named by a revision's `<format>` element.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum RevisionFormat {
+    Json,
+    Css,
+    Javascript,
+    Plain,
+    Wiki,
+    Unknown(String),
+}
+
+impl Default for RevisionFormat {
+    fn default() -> RevisionFormat { RevisionFormat::Unknown(String::from("?")) }
+}
+
+impl From<String> for RevisionFormat {
+    /// Maps a `<format>` MIME type to its variant; unrecognised values
+    /// are kept verbatim in `Unknown`.
+    fn from(mime: String) -> RevisionFormat {
+        let known = match mime.as_str() {
+            "application/json" => Some(RevisionFormat::Json),
+            "text/css" => Some(RevisionFormat::Css),
+            "text/javascript" => Some(RevisionFormat::Javascript),
+            "text/plain" => Some(RevisionFormat::Plain),
+            "text/x-wiki" => Some(RevisionFormat::Wiki),
+            _ => None,
+        };
+        known.unwrap_or_else(|| RevisionFormat::Unknown(mime))
+    }
+}
+
+/// A `<revision>` element: content model, format and text.
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Revision {
+    pub model: RevisionModel,
+    pub format: RevisionFormat,
+    pub text: String,
+}
+
+impl FromXml for Revision {
+    /// Reads the children of an already-opened `<revision>` up to
+    /// `</revision>`.
+    ///
+    /// Fails with `ErrorKind::EOF` if the input ends before that.
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self> {
+        let mut rev = Revision::default();
+        let mut buf = Vec::new();
+
+        loop {
+            match reader.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => {
+                    match e.name() {
+                        b"model" => rev.model = element_text(reader)?.into(),
+                        b"format" => rev.format = element_text(reader)?.into(),
+                        b"text" => rev.text = element_text(reader)?,
+                        _ => (),
+                    }
+                }
+                Ok(Event::End(e)) => {
+                    if e.name() == b"revision" {
+                        return Ok(rev)
+                    }
+                },
+                Ok(Event::Eof) => bail!(ErrorKind::EOF),
+                Err(err) => bail!(err),
+                _ => (),
+            }
+            buf.clear();
+        }
+    }
+}
diff --git a/src/xmlutil.rs b/src/xmlutil.rs
new file mode 100644
index 0000000..a0cebaf
--- /dev/null
+++ b/src/xmlutil.rs
@@ -0,0 +1,35 @@
+use std::io::BufRead;
+
+use quick_xml::reader::Reader;
+use quick_xml::events::Event;
+use errors::{ErrorKind, Result};
+
+/// Reads the text of the element whose start tag was just consumed,
+/// together with its closing tag.
+///
+/// An element that closes immediately (`<a></a>`) yields an empty string.
+/// Reader errors are passed through; any other unexpected event becomes
+/// `ErrorKind::ElementText` or `ErrorKind::ClosingTag`.
+pub fn element_text<B: BufRead>(reader: &mut Reader<B>) -> Result<String> {
+    let mut buf = Vec::new();
+
+    let content = match reader.read_event(&mut buf) {
+        Ok(Event::Text(e)) => e.unescape_and_decode(&reader)?,
+        // No text event precedes the closing tag of an empty element.
+        Ok(Event::End(_)) => return Ok(String::new()),
+        Err(err) => bail!(err),
+        _ => bail!(ErrorKind::ElementText(reader.buffer_position())),
+    };
+    buf.clear();
+
+    match reader.read_event(&mut buf) {
+        Ok(Event::End(_)) => Ok(content),
+        Err(err) => bail!(err),
+        _ => bail!(ErrorKind::ClosingTag(reader.buffer_position())),
+    }
+}
+
+/// Types that can be built from the events following their start tag.
+pub trait FromXml: Sized {
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self>;
+}
-- 
cgit v1.2.3-54-g00ecf