aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Shipp <nick@shipp.ninja>2017-06-02 22:49:08 -0400
committerNick Shipp <nick@shipp.ninja>2017-06-02 22:49:08 -0400
commit3f2480a08cc7335dda1c50af7f018a5a4c46d49d (patch)
tree83850c2fa734d6cda164538bfbfec92bc9abc56d
parent30bd9159921998288c623b0e7e357830c5d62bfb (diff)
Fix namespace tag name
-rw-r--r--src/lib.rs7
-rw-r--r--src/page.rs2
-rw-r--r--tools/splitwords.c90
3 files changed, 96 insertions, 3 deletions
diff --git a/src/lib.rs b/src/lib.rs
index c836159..593ad27 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -33,8 +33,11 @@ mod tests {
let pages = PageIter::new(xml);
for page in pages {
- assert!(page.namespace < 10000);
- //println!("{:#?}", page);
+ //if page.namespace == 10 {
+ // println!("{}", page.title);
+ //}
+ //assert!(page.namespace < 10000);
+ println!("{}\n(~END~)", page.revision.text);
}
}
diff --git a/src/page.rs b/src/page.rs
index b864640..bd65601 100644
--- a/src/page.rs
+++ b/src/page.rs
@@ -29,7 +29,7 @@ impl FromXml for Page {
let res = element_text(reader);
page.title = res?;
},
- b"namespace" => {
+ b"ns" => {
let res = element_text(reader);
page.namespace = res?.parse()?;
},
diff --git a/tools/splitwords.c b/tools/splitwords.c
new file mode 100644
index 0000000..5cf7f04
--- /dev/null
+++ b/tools/splitwords.c
@@ -0,0 +1,90 @@
+/*
+ * Beautifully overcomplicated version of `sed -i 's/ /\n/g'`
+ *
+ * This uses AVX2 to find and replace spaces in 32-byte chunks,
+ * because the SSE2 memchr implementation in glibc is too slow.
+ * It is not at all UTF-8-aware.
+ */
+#include <stdint.h>
+#include <unistd.h>
+#include <immintrin.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define PATH "/tmp/wikitext"
+
+/*
+ * Replace every space in PATH with a newline, in place, 32 bytes at a time.
+ * Returns 0; calls exit(-1) if stat, open or mmap fails (flush errors are only reported).
+ */
+int
+main(void)
+{
+	char *mem;
+	int fd;
+	struct stat st;
+	__m256i *cur, *end;
+	__m256i ymm0, ymm1, ymm2, ymm3, ymm4;
+	uint64_t it;
+
+	if (stat(PATH, &st) == -1) {
+		perror("stat");
+		exit(-1);
+	}
+
+	fd = open(PATH, O_RDWR);
+	if (fd == -1) {
+		perror("open " PATH);
+		exit(-1);	/* do not go on to mmap an invalid descriptor */
+	}
+
+	/* MAP_SHARED so the stores reach the file; MAP_PRIVATE discards them. */
+	mem = (char *)mmap(0, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+	if (mem == MAP_FAILED) {
+		perror("mmap");
+		exit(-1);
+	}
+
+	/* Advice values are not bit flags, so each one needs its own call. */
+	madvise(mem, st.st_size, MADV_SEQUENTIAL);
+	madvise(mem, st.st_size, MADV_WILLNEED);
+
+	end = (__m256i *)(mem + st.st_size);
+
+	/* ymm2 = 32x ' ' (0010 0000), ymm3 = 32x '\n' (0000 1010) */
+	ymm2 = _mm256_set1_epi8(' ');
+	ymm3 = _mm256_set1_epi8('\n');
+
+	/*
+	 * In 32-byte chunks: load, compare against spaces to get a byte mask,
+	 * then blend in newlines where the mask is set. The last chunk may run
+	 * past st_size; it stays inside the final mapped page, whose tail is
+	 * zero-filled and is never written back to the file.
+	 */
+	it = __rdtsc();
+	for (cur = (__m256i *)mem; cur < end; cur++) {
+		ymm0 = _mm256_load_si256(cur);
+		ymm1 = _mm256_cmpeq_epi8(ymm0, ymm2);
+		ymm4 = _mm256_blendv_epi8(ymm0, ymm3, ymm1);
+		_mm256_store_si256(cur, ymm4);
+	}
+	printf("%10.6f cycles/byte\n",
+	    (double)(__rdtsc() - it) / st.st_size);
+
+	if (msync(mem, st.st_size, MS_SYNC) == -1) {
+		perror("msync");
+	}
+	if (fsync(fd) == -1) {
+		perror("fsync");
+	}
+	if (munmap(mem, st.st_size) == -1) {
+		perror("munmap");
+	}
+	close(fd);
+	return 0;
+}