aboutsummaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rw-r--r--tools/splitwords.c90
1 files changed, 90 insertions, 0 deletions
diff --git a/tools/splitwords.c b/tools/splitwords.c
new file mode 100644
index 0000000..5cf7f04
--- /dev/null
+++ b/tools/splitwords.c
@@ -0,0 +1,90 @@
+/*
+ * Beautifully overcomplicated version of `sed -i 's/ /\n/g'`
+ *
+ * This uses AVX2 to find and replace spaces in 32-byte chunks,
+ * because the SSE2 memchr implementation in glibc is too slow.
+ * It is not at all UTF8-aware.
+ */
+#include <stdint.h>
+#include <unistd.h>
+#include <immintrin.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define PATH "/tmp/wikitext"
+
+/*
+ * Replace every space in buf[0..len) with a newline.
+ *
+ * Full 32-byte chunks are processed with AVX2; the remaining (< 32) bytes
+ * are handled one at a time so nothing outside buf[0..len) is touched.
+ * buf must be 32-byte aligned because aligned vector loads/stores are used.
+ */
+static void
+spaces_to_newlines(char *buf, size_t len)
+{
+	const __m256i spaces = _mm256_set1_epi8(' ');	/* 32x 0010 0000 */
+	const __m256i newlines = _mm256_set1_epi8('\n');	/* 32x 0000 1010 */
+	__m256i *cur = (__m256i *)buf;
+	__m256i *vec_end = cur + len / sizeof(__m256i);
+	char *end = buf + len;
+	char *tail;
+
+	/*
+	 * In 32-byte chunks,
+	 * - Load from the buffer
+	 * - Look for spaces, get a boolean byte mask
+	 * - 'Blend' original vector with a vector of newlines, switching on the byte mask
+	 * ... which turns spaces into newlines.
+	 */
+	for (; cur < vec_end; cur++) {
+		__m256i chunk = _mm256_load_si256(cur);
+		__m256i mask = _mm256_cmpeq_epi8(chunk, spaces);
+
+		_mm256_store_si256(cur, _mm256_blendv_epi8(chunk, newlines, mask));
+	}
+
+	/* Scalar tail: the bytes that do not fill a whole vector. */
+	for (tail = (char *)vec_end; tail < end; tail++) {
+		if (*tail == ' ') {
+			*tail = '\n';
+		}
+	}
+}
+
+/*
+ * Map PATH read/write, turn every space into a newline in place, print the
+ * measured cycles/byte of the replacement loop, and flush the result to disk.
+ *
+ * Returns EXIT_SUCCESS when the file was rewritten (or was empty) and
+ * EXIT_FAILURE when opening, mapping or writing back failed.
+ */
+int
+main(void)
+{
+	char *mem;
+	int fd;
+	int rc = EXIT_SUCCESS;
+	struct stat st;
+	size_t len;
+	uint64_t it;
+
+	fd = open(PATH, O_RDWR);
+	if (fd == -1) {
+		perror("open " PATH);
+		exit(EXIT_FAILURE);
+	}
+
+	/* Size the mapping from the descriptor we actually opened. */
+	if (fstat(fd, &st) == -1) {
+		perror("fstat");
+		exit(EXIT_FAILURE);
+	}
+
+	/* mmap() rejects a zero length, and there is nothing to replace anyway. */
+	if (st.st_size <= 0) {
+		close(fd);
+		return EXIT_SUCCESS;
+	}
+	len = (size_t)st.st_size;
+
+	/*
+	 * MAP_SHARED, not MAP_PRIVATE: stores to a private mapping are
+	 * copy-on-write and never reach the underlying file.
+	 */
+	mem = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+	if (mem == MAP_FAILED) {
+		perror("mmap");
+		exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * Advice values are not bit flags and must not be OR-ed together, so
+	 * each hint gets its own call.  They are only hints; failure is harmless.
+	 */
+	(void)madvise(mem, len, MADV_SEQUENTIAL);
+	(void)madvise(mem, len, MADV_WILLNEED);
+
+	/* mmap() returns page-aligned memory, which satisfies the 32-byte alignment. */
+	it = __rdtsc();
+	spaces_to_newlines(mem, len);
+	printf("%10.6f cycles/byte\n",
+	    (double)(__rdtsc() - it) / st.st_size);
+
+	if (msync(mem, len, MS_SYNC) == -1) {
+		perror("msync");
+		rc = EXIT_FAILURE;
+	}
+
+	if (fsync(fd) == -1) {
+		perror("fsync");
+		rc = EXIT_FAILURE;
+	}
+
+	if (munmap(mem, len) == -1) {
+		perror("munmap");
+		rc = EXIT_FAILURE;
+	}
+
+	if (close(fd) == -1) {
+		perror("close");
+		rc = EXIT_FAILURE;
+	}
+
+	return rc;
+}