diff options
-rw-r--r-- | .gitignore | 6 | ||||
-rw-r--r-- | Makefile | 61 | ||||
-rw-r--r-- | convert-to-plain.sh | 20 | ||||
-rw-r--r-- | src/links-outgoing-to-incoming.c (renamed from links-outgoing-to-incoming.c) | 0 | ||||
-rw-r--r-- | src/links-plain-to-binary.c (renamed from links-plain-to-binary.c) | 0 | ||||
-rw-r--r-- | src/links-xml-to-plain.l (renamed from links-xml-to-plain.l) | 0 | ||||
-rw-r--r-- | src/lookup-incoming.c (renamed from lookup-incoming.c) | 0 |
7 files changed, 67 insertions, 20 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..20bdc9e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+bin/
+enwiki-links-plain.txt
+titles.txt
+titles-sorted.txt
+links-outgoing.bin
+links-incoming.bin
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0182eff
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,61 @@
+# Builds the lookup/prep tools into $(BINDIR) and generates the data files (dbfiles).
+GCC := gcc -O3 -g
+BINDIR := bin
+SRCDIR := src
+
+# These goals name groups of work, not files on disk.
+.PHONY: all lookuptools preptools dbfiles
+# Delete a target whose recipe fails so a partial file never looks up to date.
+.DELETE_ON_ERROR:
+
+all: lookuptools preptools dbfiles
+
+# Output directory; listed after '|' below (order-only) so its mtime never forces a rebuild.
+$(BINDIR):
+	mkdir -p $@
+
+lookuptools: $(BINDIR)/lookup-incoming
+
+$(BINDIR)/lookup-incoming: $(SRCDIR)/lookup-incoming.c | $(BINDIR)
+	$(GCC) -o $@ $<
+
+preptools: $(BINDIR)/links-xml-to-plain \
+           $(BINDIR)/links-outgoing-to-incoming \
+           $(BINDIR)/links-plain-to-binary
+
+# Compile flex program to regex-convert the XML to plain text
+# https://unix.stackexchange.com/a/413684
+$(BINDIR)/links-xml-to-plain: $(SRCDIR)/links-xml-to-plain.l | $(BINDIR)
+	flex -o $(BINDIR)/links-xml-to-plain.c $<
+	$(GCC) -o $@ $(BINDIR)/links-xml-to-plain.c -lfl
+
+# Only the C source is a prerequisite: compiling must not require titles.txt
+# (links-outgoing.bin, which runs this tool, lists titles.txt itself).
+$(BINDIR)/links-plain-to-binary: $(SRCDIR)/links-plain-to-binary.c | $(BINDIR)
+	$(GCC) -o $@ $<
+
+$(BINDIR)/links-outgoing-to-incoming: $(SRCDIR)/links-outgoing-to-incoming.c | $(BINDIR)
+	$(GCC) -o $@ $<
+
+dbfiles: links-incoming.bin titles-sorted.txt
+
+# Convert to plain text
+enwiki-links-plain.txt: $(BINDIR)/links-xml-to-plain enwiki-links.xml.lzo
+	lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | $(BINDIR)/links-xml-to-plain > enwiki-links-plain.txt
+
+# Extract titles
+titles.txt: enwiki-links-plain.txt
+	grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt
+
+# Running the tool also produces titles-sorted.txt; the touch makes that file
+# newer than links-outgoing.bin so make does not consider it stale.
+links-outgoing.bin: $(BINDIR)/links-plain-to-binary titles.txt
+	$(BINDIR)/links-plain-to-binary
+	touch titles-sorted.txt
+
+# Generated in the same step as links-outgoing.bin; fail loudly if it is gone.
+titles-sorted.txt: links-outgoing.bin
+	@test -f $@ || { echo "$@ is missing: delete $< and re-run make" >&2; exit 1; }
+
+links-incoming.bin: $(BINDIR)/links-outgoing-to-incoming links-outgoing.bin
+	$(BINDIR)/links-outgoing-to-incoming
diff --git a/convert-to-plain.sh b/convert-to-plain.sh
deleted file mode 100644
index e7bd378..0000000
--- a/convert-to-plain.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Compile flex program to regex-convert the XML to plain text
-#https://unix.stackexchange.com/a/413684
-flex -o links-xml-to-plain.c links-xml-to-plain.l
-gcc -O3 -o links-xml-to-plain links-xml-to-plain.c -lfl
-
-# Convert to plain text
-lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | ./links-xml-to-plain > enwiki-links-plain.txt
-
-# Extract titles
-grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt
-
-
-
-gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c
-time ./links-plain-to-binary
-
-gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c
-time ./links-outgoing-to-incoming
diff --git a/links-outgoing-to-incoming.c b/src/links-outgoing-to-incoming.c
index 2fd853a..2fd853a 100644
--- a/links-outgoing-to-incoming.c
+++ b/src/links-outgoing-to-incoming.c
diff --git a/links-plain-to-binary.c b/src/links-plain-to-binary.c
index 005a496..005a496 100644
--- a/links-plain-to-binary.c
+++ b/src/links-plain-to-binary.c
diff --git a/links-xml-to-plain.l b/src/links-xml-to-plain.l
index dabe538..dabe538 100644
--- a/links-xml-to-plain.l
+++ b/src/links-xml-to-plain.l
diff --git a/lookup-incoming.c b/src/lookup-incoming.c
index 2d3fa9c..2d3fa9c 100644
--- a/lookup-incoming.c
+++ b/src/lookup-incoming.c