From 3f2480a08cc7335dda1c50af7f018a5a4c46d49d Mon Sep 17 00:00:00 2001 From: Nick Shipp Date: Fri, 2 Jun 2017 22:49:08 -0400 Subject: Fix namespace tag name --- src/lib.rs | 7 +++-- src/page.rs | 2 +- tools/splitwords.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 tools/splitwords.c diff --git a/src/lib.rs b/src/lib.rs index c836159..593ad27 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,8 +33,11 @@ mod tests { let pages = PageIter::new(xml); for page in pages { - assert!(page.namespace < 10000); - //println!("{:#?}", page); + //if page.namespace == 10 { + // println!("{}", page.title); + //} + //assert!(page.namespace < 10000); + println!("{}\n(~END~)", page.revision.text); } } diff --git a/src/page.rs b/src/page.rs index b864640..bd65601 100644 --- a/src/page.rs +++ b/src/page.rs @@ -29,7 +29,7 @@ impl FromXml for Page { let res = element_text(reader); page.title = res?; }, - b"namespace" => { + b"ns" => { let res = element_text(reader); page.namespace = res?.parse()?; }, diff --git a/tools/splitwords.c b/tools/splitwords.c new file mode 100644 index 0000000..5cf7f04 --- /dev/null +++ b/tools/splitwords.c @@ -0,0 +1,90 @@ +/* + * Beautifully overcomplicated version of `sed -i 's/ /\n/g'` + * + * This uses AVX2 to find and replace spaces in 32-byte chunks, + * because the SSE2 memchr implementation in glibc is too slow. + * It is not at all UTF8-aware. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PATH "/tmp/wikitext" + +int +main(void) +{ + char *mem; + int fd; + struct stat st; + size_t bytes; + __m256i *cur, *end; + __m256i ymm0, ymm1, ymm2, ymm3, ymm4; + uint64_t it; + + if (stat(PATH, &st) == -1) { + perror("stat"); + exit(-1); + } + + fd = open(PATH, O_RDWR); + if (fd == -1) { + perror("open " PATH); + } + + mem = (char *)mmap(0, st.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + + if (mem == MAP_FAILED) { + perror("mmap"); + exit(-1); + } + + madvise(mem, st.st_size, MADV_SEQUENTIAL|MADV_WILLNEED); + + end = (__m256i *)(mem + st.st_size); + + /* + * ymm2 = 32x 0010 0000 + * ymm3 = 32x 0000 1010 + */ + ymm2 = _mm256_set1_epi8(' '); + ymm3 = _mm256_set1_epi8('\n'); + + /* + * In 32-byte chunks, + * - Load from mmapped file + * - Look for spaces, get a boolean byte mask + * - 'Blend' original vector with a vector of newlines, switching on the byte mask + * ... which turns spaces into newlines. + */ + it = __rdtsc(); + for (cur=(__m256i *)mem;cur < end;cur++) { + ymm0 = _mm256_load_si256(cur); + ymm1 = _mm256_cmpeq_epi8(ymm0, ymm2); + ymm4 = _mm256_blendv_epi8(ymm0, ymm3, ymm1); + _mm256_store_si256(cur, ymm4); + } + printf("%10.6f cycles/byte\n", + (double)(__rdtsc() - it) / st.st_size); + + if (msync(mem, st.st_size, MS_SYNC) == -1) { + perror("msync"); + } + + if (fsync(fd) == -1) { + perror("fsync"); + } + + if (munmap(mem, st.st_size) == -1) { + perror("munmap"); + } + + close(fd); +} -- cgit v1.2.3-54-g00ecf