summaryrefslogtreecommitdiff
path: root/convert-to-plain.sh
blob: e7bd378520c97b76d4e4c3db3b38a2fe218fdd23 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#!/bin/bash
#
# Build the conversion tools and run the link-processing pipeline:
#   1. generate and compile the flex scanner `links-xml-to-plain`
#   2. decompress enwiki-links.xml.lzo and convert it to enwiki-links-plain.txt
#   3. extract the sorted title list into titles.txt
#   4. compile and run links-plain-to-binary, then links-outgoing-to-incoming
#
# Needs flex (with libfl), gcc, lzop and pv. All paths are relative, so run it
# from the directory that holds the sources and enwiki-links.xml.lzo.

# Stop at the first failing step instead of carrying on with missing, stale or
# truncated intermediate files. `pipefail` makes a failure in any pipeline
# stage (e.g. lzop unable to read the archive) count, not just the last one.
set -euo pipefail

# Compile flex program to regex-convert the XML to plain text
# https://unix.stackexchange.com/a/413684
flex -o links-xml-to-plain.c links-xml-to-plain.l
gcc -O3 -o links-xml-to-plain links-xml-to-plain.c -lfl

# Convert to plain text. `pv` only reports throughput; `tail -c +45` drops the
# first 44 bytes of the decompressed stream before it reaches the scanner
# (presumably a fixed-size header -- TODO confirm against the dump format).
lzop -dc enwiki-links.xml.lzo \
  | pv \
  | tail -c +45 \
  | ./links-xml-to-plain > enwiki-links-plain.txt

# Extract titles: keep only the lines that start with the `~~~~` marker and
# strip that marker. A single `sed -n ... p` both filters and strips, so no
# separate grep pass is needed.
# NOTE(review): the sort order depends on the current locale; if a consumer of
# titles.txt expects byte order, run this step under LC_ALL=C -- confirm.
sed -n 's/^~~~~//p' enwiki-links-plain.txt | sort > titles.txt

# Build and run the plain-text -> binary converter (timed).
gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c
time ./links-plain-to-binary

# Build and run the outgoing -> incoming links converter (timed).
gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c
time ./links-outgoing-to-incoming