diff options
Diffstat (limited to 'convert-to-plain.sh')
-rw-r--r-- | convert-to-plain.sh | 20 |
1 files changed, 0 insertions, 20 deletions
diff --git a/convert-to-plain.sh b/convert-to-plain.sh
deleted file mode 100644
index e7bd378..0000000
--- a/convert-to-plain.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Compile flex program to regex-convert the XML to plain text
-#https://unix.stackexchange.com/a/413684
-flex -o links-xml-to-plain.c links-xml-to-plain.l
-gcc -O3 -o links-xml-to-plain links-xml-to-plain.c -lfl
-
-# Convert to plain text
-lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | ./links-xml-to-plain > enwiki-links-plain.txt
-
-# Extract titles
-grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt
-
-
-
-gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c
-time ./links-plain-to-binary
-
-gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c
-time ./links-outgoing-to-incoming