summary | refs | log | tree | commit | diff
path: root/convert-to-plain.sh
diff options
context:
space:
mode:
Diffstat (limited to 'convert-to-plain.sh')
-rw-r--r--convert-to-plain.sh20
1 files changed, 20 insertions, 0 deletions
diff --git a/convert-to-plain.sh b/convert-to-plain.sh
new file mode 100644
index 0000000..e7bd378
--- /dev/null
+++ b/convert-to-plain.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Build the helper programs and run the conversion pipeline for the
+# enwiki-links dump: compressed XML to plain text, then the title list,
+# then two further C programs that are compiled and timed.
+#
+# Expects enwiki-links.xml.lzo and the .l and .c sources in the current
+# directory. Needs flex, gcc, lzop, pv and the usual coreutils.
+
+# Abort on the first failing command or pipeline stage so that a failed
+# build or decompression never feeds stale or truncated files to the
+# later steps.
+set -euo pipefail
+
+# Compile flex program to regex-convert the XML to plain text
+#https://unix.stackexchange.com/a/413684
+flex -o links-xml-to-plain.c links-xml-to-plain.l
+gcc -O3 -o links-xml-to-plain links-xml-to-plain.c -lfl
+
+# Convert to plain text. pv shows progress and tail -c +45 drops the
+# first 44 bytes of the decompressed stream before the lexer sees it.
+# NOTE review - presumably a fixed-size header, confirm against the dump.
+lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | ./links-xml-to-plain > enwiki-links-plain.txt
+
+# Extract titles - keep only lines that start with ~~~~ and strip that
+# marker. A single sed pass both filters and strips.
+sed -n 's/^~~~~//p' enwiki-links-plain.txt | sort > titles.txt
+
+# Build links-plain-to-binary with debug info and time its run
+gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c
+time ./links-plain-to-binary
+
+# Build links-outgoing-to-incoming with debug info and time its run
+gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c
+time ./links-outgoing-to-incoming