aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Shipp <nick@shipp.ninja>2017-05-27 23:27:57 -0400
committerNick Shipp <nick@shipp.ninja>2017-05-27 23:27:57 -0400
commit30bd9159921998288c623b0e7e357830c5d62bfb (patch)
treeb52e4cd6ed108efd5e58e73069375fcaee069ce2
WIP: Wrote partial lifter from mediawiki dumps
-rw-r--r--.gitignore5
-rw-r--r--Cargo.toml12
-rw-r--r--src/abstractdoc.rs96
-rw-r--r--src/errors.rs28
-rw-r--r--src/lib.rs77
-rw-r--r--src/page.rs118
-rw-r--r--src/revision.rs96
-rw-r--r--src/xmlutil.rs30
8 files changed, 462 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..92aeac1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+target/
+**/*.rs.bk
+Cargo.lock
+.*.un~
+.*.sw*
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..c19db61
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "wikiparse"
+version = "0.1.0"
+authors = ["Nick Shipp <nick@shipp.ninja>"]
+
+[dependencies]
+bzip2 = "^0.3"
+quick-xml = "^0.7"
+error-chain = "^0.10"
+serde = "^1.0"
+serde_derive = "^1.0"
+bincode = "^0.8"
diff --git a/src/abstractdoc.rs b/src/abstractdoc.rs
new file mode 100644
index 0000000..d45b4fc
--- /dev/null
+++ b/src/abstractdoc.rs
@@ -0,0 +1,96 @@
+use std::io::BufRead;
+
+use quick_xml::events::Event;
+use quick_xml::reader::Reader;
+
+use errors::{Error, ErrorKind};
+use xmlutil::{FromXml, element_text};
+
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
+pub struct AbstractLink { // one `<sublink>` entry collected into `AbstractDoc::links`
+ pub anchor: String, // text content of the `<anchor>` child element
+ pub link: String, // text content of the `<link>` child element
+}
+
+/// Parses the children of a `<sublink>` element whose start tag was already consumed.
+impl FromXml for AbstractLink {
+    /// Fills `anchor` and `link` from the same-named child elements and returns at
+    /// `</sublink>`. Fails on reader or `element_text` errors, and with
+    /// `ErrorKind::EOF` when the input ends before the closing tag.
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self, Error> {
+        let mut link = AbstractLink::default();
+        let mut buf = Vec::new();
+
+        loop {
+            // Clear the previous event's bytes so `buf` does not grow with every
+            // event (the quick-xml reader docs recommend this).
+            buf.clear();
+            match reader.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => match e.name() {
+                    b"anchor" => link.anchor = element_text(reader)?,
+                    b"link" => link.link = element_text(reader)?,
+                    _ => (),
+                },
+                // Only the enclosing tag ends the link; a stray child's end tag must not.
+                Ok(Event::End(ref e)) if e.name() == b"sublink" => return Ok(link),
+                // A truncated input used to spin forever: `Eof` hit the catch-all arm.
+                Ok(Event::Eof) => bail!(ErrorKind::EOF),
+                Err(err) => bail!(err),
+                _ => (),
+            }
+        }
+    }
+}
+
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
+pub struct AbstractDoc { // one `<doc>` record of an abstract dump (see the `enwp_abstract` test)
+ pub title: String, // text content of `<title>`
+ pub url: String, // text content of `<url>`
+ pub abstract_text: String, // text content of `<abstract>`
+ pub links: Vec<AbstractLink>, // one entry per `<sublink>` element, in document order
+}
+
+/// Parses the children of a `<doc>` element whose start tag was already consumed.
+impl FromXml for AbstractDoc {
+    /// Reads `<title>`, `<url>`, `<abstract>` and every `<sublink>` until `</doc>`.
+    ///
+    /// Elements with other names are skipped together with their end tags, so a
+    /// wrapper around the sublinks does not end the document early and the reader
+    /// is left just past `</doc>` on success.
+    ///
+    /// # Errors
+    /// Propagates reader, `element_text` and `AbstractLink::from_xml` failures and
+    /// returns `ErrorKind::EOF` when the input ends before `</doc>`.
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self, Error> {
+        let mut abs = AbstractDoc::default();
+        let mut buf = Vec::new();
+
+        loop {
+            // Clear the previous event's bytes so `buf` does not grow with every
+            // event (the quick-xml reader docs recommend this).
+            buf.clear();
+
+            match reader.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => match e.name() {
+                    b"title" => abs.title = element_text(reader)?,
+                    b"url" => abs.url = element_text(reader)?,
+                    b"abstract" => abs.abstract_text = element_text(reader)?,
+                    b"sublink" => {
+                        let link = AbstractLink::from_xml(reader)?;
+                        abs.links.push(link);
+                    },
+                    _ => (),
+                },
+                // Only `</doc>` finishes the record; returning on any end tag (as
+                // before) truncated documents at the first unknown child element.
+                Ok(Event::End(ref e)) if e.name() == b"doc" => return Ok(abs),
+                Ok(Event::Eof) => break,
+                Err(err) => bail!(err),
+                _ => (),
+            }
+        }
+
+        // The input ended while the document was still open.
+        bail!(ErrorKind::EOF)
+    }
+}
diff --git a/src/errors.rs b/src/errors.rs
new file mode 100644
index 0000000..2704bd9
--- /dev/null
+++ b/src/errors.rs
@@ -0,0 +1,28 @@
+use quick_xml;
+
+error_chain! { // generates the `Error`, `ErrorKind` and `Result` items the other modules import
+ foreign_links { // external error types wrapped as `ErrorKind` variants
+ Io(::std::io::Error);
+ XML(quick_xml::errors::Error); // lets `bail!(err)` and `?` accept quick-xml reader errors
+ NumParse(::std::num::ParseIntError); // integer parsing of element text (the page namespace)
+ Utf8(::std::str::Utf8Error); // `str::from_utf8` on the redirect title attribute
+ }
+ errors { // failure kinds specific to this crate
+ FromXml(pos: usize, tag: String) { // not constructed anywhere in the code shown in this commit
+ description("invalid input while processing tag")
+ display("error at pos {}: invalid <{}>", pos, tag)
+ }
+ ElementText(pos: usize) { // `element_text`: the element's content was not a text event
+ description("found something other than text")
+ display("error at pos {}: not text", pos)
+ }
+ ClosingTag(pos: usize) { // `element_text`: the text was not followed by an end tag
+ description("found something other than a closing tag")
+ display("error at pos {}: expected a closing tag", pos)
+ }
+ EOF { // the input ended while an element was still open
+ description("premature EOF")
+ display("expected more things")
+ }
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..c836159
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,77 @@
+#[macro_use]
+extern crate serde_derive;
+extern crate bincode;
+#[macro_use]
+extern crate error_chain;
+extern crate quick_xml;
+extern crate bzip2;
+
+mod page;
+mod abstractdoc;
+mod xmlutil;
+mod errors;
+mod revision;
+
+pub use xmlutil::FromXml;
+pub use abstractdoc::AbstractDoc;
+pub use page::{Page, PageIter};
+pub use revision::Revision;
+
+// use bzip2::read::BzDecoder;
+
+#[cfg(test)]
+mod tests {
+    use quick_xml::reader::Reader;
+    use quick_xml::events::Event;
+    use super::*;
+    use bincode::{serialize, Infinite};
+    use std::str;
+
+    /// Walks every page of a local pages-articles dump. Ignored by default, like
+    /// `enwp_abstract`: the hard-coded, machine-specific path would make a plain
+    /// `cargo test` panic on `unwrap`. Run with `cargo test -- --ignored`.
+    #[test]
+    #[ignore]
+    fn enwp_page() {
+        let xml = Reader::from_file("/fast/scratch/enwiki-20170520-pages-articles-multistream.xml").unwrap();
+        let pages = PageIter::new(xml);
+
+        for page in pages {
+            assert!(page.namespace < 10000);
+        }
+    }
+
+    /// Parses every `<doc>` of a local abstract dump (title and URL must be non-empty) and prints the bincode size.
+    #[test]
+    #[ignore]
+    fn enwp_abstract() {
+        let mut xml = Reader::from_file("/tmp/enwiki-20170520-abstract.xml").unwrap();
+        let mut buf = Vec::new();
+        let mut collect = Vec::new();
+
+        loop {
+            // Drop the previous event's bytes so `buf` stays small over a huge dump.
+            buf.clear();
+            match xml.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => {
+                    if e.name() == b"doc" {
+                        match AbstractDoc::from_xml(&mut xml) {
+                            Ok(abs) => {
+                                assert!(abs.title != "");
+                                assert!(abs.url != "");
+                                collect.push(abs);
+                            },
+                            Err(err) => panic!("Abstract XML: {}", err),
+                        }
+                    }
+                },
+                Ok(Event::Eof) => break,
+                Err(err) => panic!("parse error {}", err),
+                _ => (),
+            }
+        }
+
+        let encoded: Vec<u8> = serialize(&collect, Infinite).unwrap();
+        println!("Encoded size: {}", encoded.len());
+    }
+}
diff --git a/src/page.rs b/src/page.rs
new file mode 100644
index 0000000..b864640
--- /dev/null
+++ b/src/page.rs
@@ -0,0 +1,118 @@
+use std::io::BufRead;
+use std::str;
+
+use quick_xml::reader::Reader;
+use quick_xml::events::Event;
+
+use errors::Result;
+use xmlutil::{FromXml, element_text};
+use revision::Revision;
+
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Page { // one `<page>` element of a dump, as yielded by `PageIter`
+ pub namespace: i32, // parsed as an integer from the page's namespace element; 0 (the `Default`) if none is seen
+ pub title: String, // text content of `<title>`
+ pub redirect: Option<String>, // `title` attribute of an empty `<redirect .../>` tag, if present
+ pub revision: Revision, // parsed from the `<revision>` child (the last one if several occur)
+}
+
+/// Parses the children of a `<page>` element whose start tag was already consumed.
+impl FromXml for Page {
+    /// Reads the title, namespace, redirect target and revision until `</page>`.
+    ///
+    /// # Errors
+    /// Propagates reader, `element_text`, integer-parse, UTF-8 and `Revision`
+    /// failures; returns `ErrorKind::EOF` if the input ends before `</page>`.
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self> {
+        let mut page = Page::default();
+        let mut buf = Vec::new();
+
+        loop {
+            // Clear the previous event's bytes so `buf` does not grow with every
+            // event (the quick-xml reader docs recommend this).
+            buf.clear();
+
+            match reader.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => match e.name() {
+                    b"title" => page.title = element_text(reader)?,
+                    // MediaWiki export XML stores a page's namespace number in
+                    // `<ns>`, so matching only `<namespace>` left the field at 0;
+                    // the old tag name is still accepted.
+                    b"ns" | b"namespace" => {
+                        page.namespace = element_text(reader)?.parse()?;
+                    },
+                    b"revision" => page.revision = Revision::from_xml(reader)?,
+                    _ => (),
+                },
+                Ok(Event::Empty(ref e)) => {
+                    if e.name() == b"redirect" {
+                        // Attribute checks stay off and malformed attributes are
+                        // skipped, as before. NOTE(review): `attr.value` looks like
+                        // the raw bytes, so entities such as `&amp;` may remain
+                        // escaped in the title — confirm against quick-xml.
+                        for attr in e.attributes().with_checks(false) {
+                            if let Ok(attr) = attr {
+                                if attr.key == b"title" {
+                                    page.redirect = Some(str::from_utf8(attr.value)?.into());
+                                }
+                            }
+                        }
+                    }
+                },
+                Ok(Event::End(ref e)) if e.name() == b"page" => return Ok(page),
+                // A truncated dump used to spin forever here: `Eof` fell through to
+                // the catch-all arm and the loop simply read `Eof` again.
+                Ok(Event::Eof) => bail!(::errors::ErrorKind::EOF),
+                Err(err) => bail!(err),
+                _ => (),
+            }
+        }
+    }
+}
+
+pub struct PageIter<B: BufRead> { // iterator yielding each `<page>` of a dump as a `Page`
+ reader: Reader<B>, // XML event source; also handed to `Page::from_xml`
+ buf: Vec<u8>, // scratch buffer passed to `read_event` in `next`
+}
+
+/// Yields one `Page` per `<page>` element found in the underlying reader.
+impl<B: BufRead> Iterator for PageIter<B> {
+    type Item = Page;
+
+    /// Skips events until the next `<page>` start tag and parses that page. Returns
+    /// `None` at end of input and also after any error, which is only printed.
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            // Without this the buffer kept the bytes of every top-level event of the
+            // dump; the quick-xml reader docs recommend clearing it between reads.
+            self.buf.clear();
+
+            match self.reader.read_event(&mut self.buf) {
+                Ok(Event::Start(ref e)) if e.name() == b"page" => {
+                    return match Page::from_xml(&mut self.reader) {
+                        Ok(page) => Some(page),
+                        Err(err) => {
+                            println!("parse error: {}", err);
+                            None
+                        },
+                    };
+                },
+                Ok(Event::Eof) => return None,
+                Err(err) => {
+                    println!("parse error: {}", err);
+                    return None
+                },
+                _ => (),
+            }
+        }
+    }
+}
+
+impl<B: BufRead> PageIter<B> {
+    /// Creates an iterator over the pages readable from `reader`.
+    /// The scratch buffer passed to `read_event` starts out empty.
+    pub fn new(reader: Reader<B>) -> Self {
+        let buf = Vec::new();
+        PageIter { reader: reader, buf: buf }
+    }
+}
diff --git a/src/revision.rs b/src/revision.rs
new file mode 100644
index 0000000..5d47f1a
--- /dev/null
+++ b/src/revision.rs
@@ -0,0 +1,96 @@
+use std::io::BufRead;
+
+use quick_xml::events::Event;
+use quick_xml::reader::Reader;
+
+use errors::Result;
+use xmlutil::{FromXml, element_text};
+
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum RevisionModel { // value of a revision's `<model>` element
+ Css, // "css"
+ Javascript, // "javascript"
+ Json, // "json"
+ MassMessageListContent, // "MassMessageListContent"
+ Scribunto, // "Scribunto"
+ Wikitext, // "wikitext"
+ Unknown(String), // any other model string, kept verbatim; `Unknown("?")` is the `Default`
+}
+
+impl Default for RevisionModel { // placeholder value held until a `<model>` element is parsed
+    fn default() -> Self { RevisionModel::Unknown("?".to_string()) }
+}
+
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum RevisionFormat { // value of a revision's `<format>` element
+ Json, // "application/json"
+ Css, // "text/css"
+ Javascript, // "text/javascript"
+ Plain, // "text/plain"
+ Wiki, // "text/x-wiki"
+ Unknown(String), // any other format string, kept verbatim; `Unknown("?")` is the `Default`
+}
+
+impl Default for RevisionFormat { // placeholder value held until a `<format>` element is parsed
+    fn default() -> Self { RevisionFormat::Unknown("?".to_string()) }
+}
+
+#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Revision { // the `<revision>` element of a page
+ pub model: RevisionModel, // from `<model>`; stays `Unknown("?")` if no such element is seen
+ pub format: RevisionFormat, // from `<format>`; stays `Unknown("?")` if no such element is seen
+ pub text: String, // text content of `<text>`; stays empty if no such start tag is seen
+}
+
+/// Parses the children of a `<revision>` element whose start tag was already consumed.
+impl FromXml for Revision {
+    /// Reads `<model>`, `<format>` and `<text>` until `</revision>`. Model and format
+    /// strings without a dedicated variant are kept in `Unknown`. Fails on reader or
+    /// `element_text` errors and with `ErrorKind::EOF` on premature end of input.
+    fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self> {
+        let mut rev = Revision::default();
+        let mut buf = Vec::new();
+
+        loop {
+            // Clear the previous event's bytes so `buf` does not grow with every
+            // event (the quick-xml reader docs recommend this).
+            buf.clear();
+
+            match reader.read_event(&mut buf) {
+                Ok(Event::Start(ref e)) => match e.name() {
+                    b"model" => {
+                        let model = element_text(reader)?;
+                        rev.model = match model.as_ref() {
+                            "css" => RevisionModel::Css,
+                            "javascript" => RevisionModel::Javascript,
+                            "json" => RevisionModel::Json,
+                            "MassMessageListContent" => RevisionModel::MassMessageListContent,
+                            "Scribunto" => RevisionModel::Scribunto,
+                            "wikitext" => RevisionModel::Wikitext,
+                            _ => RevisionModel::Unknown(model),
+                        };
+                    },
+                    b"format" => {
+                        let format = element_text(reader)?;
+                        rev.format = match format.as_ref() {
+                            "application/json" => RevisionFormat::Json,
+                            "text/css" => RevisionFormat::Css,
+                            "text/javascript" => RevisionFormat::Javascript,
+                            "text/plain" => RevisionFormat::Plain,
+                            "text/x-wiki" => RevisionFormat::Wiki,
+                            _ => RevisionFormat::Unknown(format),
+                        };
+                    },
+                    b"text" => rev.text = element_text(reader)?,
+                    _ => (),
+                },
+                Ok(Event::End(ref e)) if e.name() == b"revision" => return Ok(rev),
+                // A truncated dump used to spin forever here: `Eof` fell through to
+                // the catch-all arm and the loop simply read `Eof` again.
+                Ok(Event::Eof) => bail!(::errors::ErrorKind::EOF),
+                Err(err) => bail!(err),
+                _ => (),
+            }
+        }
+    }
+}
diff --git a/src/xmlutil.rs b/src/xmlutil.rs
new file mode 100644
index 0000000..a0cebaf
--- /dev/null
+++ b/src/xmlutil.rs
@@ -0,0 +1,30 @@
+use std::io::BufRead;
+
+use quick_xml::reader::Reader;
+use quick_xml::events::Event;
+use errors::{ErrorKind, Result};
+
+/// Reads the text of the element whose start tag was just consumed, then its end tag.
+/// Reader errors are propagated; an immediately closed element yields an empty string.
+pub fn element_text<B: BufRead>(reader: &mut Reader<B>) -> Result<String> {
+    let mut buf = Vec::new();
+
+    let content = match reader.read_event(&mut buf) {
+        Ok(Event::Text(e)) => e.unescape_and_decode(&reader)?,
+        Ok(Event::End(_)) => return Ok(String::new()),
+        Ok(Event::Eof) => bail!(ErrorKind::EOF),
+        Ok(_) => bail!(ErrorKind::ElementText(reader.buffer_position())),
+        Err(err) => bail!(err),
+    };
+
+    buf.clear();
+    match reader.read_event(&mut buf) {
+        Ok(Event::End(_)) => Ok(content),
+        Ok(_) => bail!(ErrorKind::ClosingTag(reader.buffer_position())),
+        Err(err) => bail!(err),
+    }
+}
+
+pub trait FromXml: Sized { // types that can be built from a quick-xml event stream
+ fn from_xml<B: BufRead>(reader: &mut Reader<B>) -> Result<Self>; // callers in this crate invoke it right after reading the element's start tag
+}