Move to Makefile and .gitignore
authornorly <ny-git@enpas.org>
Sun, 14 Jul 2019 15:28:47 +0000 (17:28 +0200)
committernorly <ny-git@enpas.org>
Sun, 14 Jul 2019 15:28:47 +0000 (17:28 +0200)
.gitignore [new file with mode: 0644]
Makefile [new file with mode: 0644]
convert-to-plain.sh [deleted file]
links-outgoing-to-incoming.c [deleted file]
links-plain-to-binary.c [deleted file]
links-xml-to-plain.l [deleted file]
lookup-incoming.c [deleted file]
src/links-outgoing-to-incoming.c [new file with mode: 0644]
src/links-plain-to-binary.c [new file with mode: 0644]
src/links-xml-to-plain.l [new file with mode: 0644]
src/lookup-incoming.c [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..20bdc9e
--- /dev/null
@@ -0,0 +1,6 @@
+bin/
+enwiki-links-plain.txt
+titles.txt
+titles-sorted.txt
+links-outgoing.bin
+links-incoming.bin
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..0182eff
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,61 @@
+GCC=gcc -O3 -g
+BINDIR=bin
+SRCDIR=src
+
+all: lookuptools preptools dbfiles
+
+
+$(BINDIR):
+       mkdir -p $(BINDIR)
+
+
+lookuptools: $(BINDIR)/lookup-incoming
+
+$(BINDIR)/lookup-incoming: $(SRCDIR)/lookup-incoming.c
+       mkdir -p $(BINDIR)
+       $(GCC) -o $@ $<
+
+
+
+preptools: $(BINDIR)/links-xml-to-plain \
+       $(BINDIR)/links-outgoing-to-incoming \
+       $(BINDIR)/links-plain-to-binary
+
+$(BINDIR)/links-xml-to-plain: $(SRCDIR)/links-xml-to-plain.l
+       mkdir -p $(BINDIR)
+       # Compile flex program to regex-convert the XML to plain text
+       #https://unix.stackexchange.com/a/413684
+       flex -o $(BINDIR)/links-xml-to-plain.c $^
+       $(GCC) -o $@ $(BINDIR)/links-xml-to-plain.c -lfl
+
+$(BINDIR)/links-plain-to-binary: $(SRCDIR)/links-plain-to-binary.c titles.txt
+       mkdir -p $(BINDIR)
+       $(GCC) -o $@ $<
+
+$(BINDIR)/links-outgoing-to-incoming: $(SRCDIR)/links-outgoing-to-incoming.c
+       mkdir -p $(BINDIR)
+       $(GCC) -o $@ $<
+
+
+
+dbfiles: links-incoming.bin titles-sorted.txt
+
+enwiki-links-plain.txt: $(BINDIR)/links-xml-to-plain enwiki-links.xml.lzo
+       # Convert to plain text
+       lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | $(BINDIR)/links-xml-to-plain > enwiki-links-plain.txt
+
+titles.txt: enwiki-links-plain.txt
+       # Extract titles
+       grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt
+
+links-outgoing.bin: $(BINDIR)/links-plain-to-binary titles.txt
+       # This also produces titles-sorted.txt
+       $(BINDIR)/links-plain-to-binary
+       # Update timestamp so make does not rebuild links-outgoing.bin
+       touch titles-sorted.txt
+
+titles-sorted.txt: links-outgoing.bin
+       # Generated in the same step as links-outgoing.bin
+
+links-incoming.bin: $(BINDIR)/links-outgoing-to-incoming links-outgoing.bin
+       $(BINDIR)/links-outgoing-to-incoming
diff --git a/convert-to-plain.sh b/convert-to-plain.sh
deleted file mode 100644 (file)
index e7bd378..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Compile flex program to regex-convert the XML to plain text
-#https://unix.stackexchange.com/a/413684
-flex -o links-xml-to-plain.c links-xml-to-plain.l
-gcc -O3  -o links-xml-to-plain links-xml-to-plain.c -lfl
-
-# Convert to plain text
-lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | ./links-xml-to-plain > enwiki-links-plain.txt
-
-# Extract titles
-grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt
-
-
-
-gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c
-time ./links-plain-to-binary
-
-gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c
-time ./links-outgoing-to-incoming
diff --git a/links-outgoing-to-incoming.c b/links-outgoing-to-incoming.c
deleted file mode 100644 (file)
index 2fd853a..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-#include <assert.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-
-
-typedef __uint32_t art_id;
-
-
-int main()
-{
-       FILE *in_file;
-       FILE *out_file;
-
-       art_id titles;
-
-       art_id **linko;
-       art_id *linkos;
-
-       art_id **linki;
-       art_id *linkis;
-
-       art_id link_titles_done = 0;
-
-       art_id i;
-
-
-       /*
-        * Read all outgoing links into memory
-        */
-
-       in_file = fopen("links-outgoing.bin", "rb");
-
-       fread(&titles, sizeof(titles), 1, in_file);
-
-       linko = malloc(titles * sizeof(art_id*));
-       linkos = malloc(titles * sizeof(art_id));
-
-       for (i = 0; i < titles; i++) {
-               art_id j;
-
-               fread(&linkos[i], sizeof(linkos[i]), 1, in_file);
-
-               linko[i] = malloc(linkos[i] * sizeof(linko[i][0]));
-
-               for (j = 0; j < linkos[i]; j++) {
-                       fread(&linko[i][j], sizeof(linko[i][j]), 1, in_file);
-               }
-       }
-       fclose(in_file);
-
-       printf("Outgoing links read.\n");
-
-
-
-       linki = malloc(titles * sizeof(art_id*));
-       linkis = malloc(titles * sizeof(art_id));
-
-       for (i = 0; i < titles; i++) {
-               art_id j;
-
-               for (j = 0; j < linkos[i]; j++) {
-                       art_id x = linko[i][j];
-
-                       linkis[x]++;
-                       linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0]));
-
-                       linki[x][linkis[x] - 1] = i;
-               }
-       }
-
-       printf("Links turned upside down.\n");
-
-
-
-       out_file = fopen("links-incoming.bin", "wb");
-       fwrite(&titles, sizeof(titles), 1, out_file);
-       for (i = 0; i < titles; i++) {
-               art_id j;
-
-               fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file);
-
-               for (j = 0; j < linkis[i]; j++) {
-                       fwrite(&linki[i][j], sizeof(linki[i][j]), 1, out_file);
-               }
-       }
-       fclose(out_file);
-
-       printf("Incoming links written.\n");
-}
diff --git a/links-plain-to-binary.c b/links-plain-to-binary.c
deleted file mode 100644 (file)
index 005a496..0000000
+++ /dev/null
@@ -1,178 +0,0 @@
-#include <assert.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-
-
-typedef __uint32_t art_id;
-
-
-static int
-cmpstring_pp_pp(const void *p1, const void *p2)
-{
-       return strcasecmp(* (char * const *) p1, * (char * const *) p2);
-}
-
-static int
-cmpstring_p_pp(const void *p1, const void *p2)
-{
-       return strcasecmp((char * const) p1, * (char * const *) p2);
-}
-
-
-
-
-int main()
-{
-       FILE *in_file;
-       FILE *out_file;
-
-       char **title = NULL;
-       art_id titles = 0;
-       art_id titles_alloc = 0;
-
-       art_id **link;
-       art_id *links;
-
-       art_id link_titles_done = 0;
-
-       art_id i;
-
-
-       /*
-        * Read all titles into memory
-        */
-
-       in_file = fopen("titles.txt", "r");
-       while (!feof(in_file)) {
-               char *in_line = NULL;
-               ssize_t in_line_len = 0;
-               size_t zero = 0;
-
-               if (titles == titles_alloc) {
-                       titles_alloc += 100000;
-                       title = realloc(title, titles_alloc * sizeof(title[0]));
-               }
-
-               in_line_len = getline(&in_line, &zero, in_file);
-
-               /* Ignore empty lines and errors */
-               if (in_line_len < 2) {
-                       continue;
-               }
-
-               /* Delete trailing newline */
-               in_line[in_line_len - 1] = '\0';
-
-               title[titles] = in_line;
-               titles++;
-       }
-       fclose(in_file);
-
-       qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
-
-       printf("Sorting done.\n");
-
-
-
-       link = malloc(titles * sizeof(art_id*));
-       links = malloc(titles * sizeof(art_id));
-
-       in_file = fopen("enwiki-links-plain.txt", "r");
-       while (!feof(in_file)) {
-               char *in_line = NULL;
-               ssize_t in_line_len = 0;
-               size_t zero = 0;
-
-               in_line_len = getline(&in_line, &zero, in_file);
-
-               /* Ignore empty lines and errors */
-               if (in_line_len < 2) {
-                       //printf("%d\n", links[i]);
-                       if (in_line)
-                               free(in_line);
-                       continue;
-               }
-
-               /* Delete trailing newline */
-               in_line[in_line_len - 1] = '\0';
-
-               if (in_line_len > 5
-                   && !memcmp(in_line, "~~~~", 4)) {
-                       /* Title */
-                       char **cur_title = bsearch(&in_line[4], title,
-                                               titles, sizeof(title[0]),
-                                               cmpstring_p_pp);
-
-                       if (!cur_title) {
-                               printf("TITLE NOT FOUND: %s\n", in_line);
-                               assert(cur_title);
-                       }
-
-                       i = cur_title - title;
-                       //printf("%s\n", title[i]);
-
-                       link_titles_done++;
-                       if (0 == (link_titles_done % 100000)) {
-                               printf("%d\n", link_titles_done);
-                       }
-               } else {
-                       /* Link */
-
-                       /* Delete trailing anchor */
-                       strtok(in_line, "#");
-
-                       char **cur_link = bsearch(&in_line[0], title,
-                                               titles, sizeof(title[0]),
-                                               cmpstring_p_pp);
-
-                       if (!cur_link) {
-                               //printf("LINK NOT FOUND: %s\n", in_line);
-                               free(in_line);
-                               continue;
-                       }
-
-                       links[i]++;
-                       link[i] = realloc(link[i], links[i] * sizeof(art_id));
-
-                       link[i][links[i] - 1] = cur_link - title;
-
-                       //printf("%s -- %d\n", title[i], links[i]);
-               }
-
-               free(in_line);
-       }
-       fclose(in_file);
-
-       printf("Links identified.\n");
-
-
-
-       out_file = fopen("titles-sorted.txt", "w");
-       for (i = 0; i < titles; i++) {
-               fputs(title[i], out_file);
-               fputs("\n", out_file);
-       }
-       fclose(out_file);
-
-       printf("Titles written.\n");
-
-
-
-       out_file = fopen("links-outgoing.bin", "wb");
-       fwrite(&titles, sizeof(titles), 1, out_file);
-       for (i = 0; i < titles; i++) {
-               art_id j;
-
-               fwrite(&links[i], sizeof(links[i]), 1, out_file);
-
-               for (j = 0; j < links[i]; j++) {
-                       fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
-               }
-       }
-       fclose(out_file);
-
-       printf("Links written.\n");
-}
diff --git a/links-xml-to-plain.l b/links-xml-to-plain.l
deleted file mode 100644 (file)
index dabe538..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-%%
-\<d\>     ;
-\<\/d\>   ;
-\<p>      ;
-\<\/p\>   printf("\n");
-\<t>      printf("~~~~");
-\<\/t\>   printf("\n");
-\<l>      ;
-\<\/l\>   printf("\n");
-%%
\ No newline at end of file
diff --git a/lookup-incoming.c b/lookup-incoming.c
deleted file mode 100644 (file)
index 2d3fa9c..0000000
+++ /dev/null
@@ -1,257 +0,0 @@
-#include <assert.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-
-
-#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y))
-
-
-typedef __uint32_t bitfield_type;
-#define BITS_PER_BITFIELD 32
-#define BITS_PER_BITFIELD_LOG 5
-
-#define BIT_SET(set, d) \
-  ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) )
-#define BIT_ISSET(set, d) \
-  (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) ))
-
-int ANY_BITS_SET(bitfield_type *set, size_t elems)
-{
-       size_t i;
-
-       for (i = 0; i < elems; i++)
-               if (set[i] != 0)
-                       return 1;
-
-       return 0;
-}
-
-
-typedef __uint32_t art_id;
-
-
-
-static int
-cmpstring_p_pp(const void *p1, const void *p2)
-{
-       return strcasecmp((char * const) p1, * (char * const *) p2);
-}
-
-
-
-
-int main(int argc, char **argv)
-{
-       FILE *in_file;
-       FILE *out_file;
-
-       char *title_blob;
-       size_t title_blob_len;
-       char **title = NULL;
-       art_id titles;
-       art_id titles_read = 0;
-
-       art_id *link_blob;
-       size_t link_blob_len;
-       art_id **linki;
-       art_id *linkis;
-
-       char **cur_title;
-       art_id title_id;
-
-       art_id cur_dist;
-
-       art_id i;
-
-       bitfield_type *titles_seen;
-       bitfield_type *titles_prev_round;
-       bitfield_type *titles_this_round;
-       size_t titles_bitfield_elems;
-       size_t titles_bitfield_bytes;
-
-
-       if (argc < 2) {
-               printf("Supply article name to look up articles linking to it.\n");
-               return 1;
-       }
-
-
-       /*
-        * Read all incoming links into memory
-        */
-
-       in_file = fopen("links-incoming.bin", "rb");
-       fseek(in_file, 0, SEEK_END);
-       link_blob_len = ftell(in_file);
-       rewind(in_file);
-       printf("Link blob size: %zd bytes.\n", link_blob_len);
-
-       link_blob = malloc(link_blob_len);
-       if (!link_blob) {
-               printf("Failed to allocate memory.\n");
-               return 1;
-       }
-
-       if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) {
-               printf("Failed to read entire file in one go.\n");
-               return 1;
-       }
-       printf("Link blob read (%zd bytes).\n", ftell(in_file));
-       fclose(in_file);
-
-       titles = *link_blob;
-       link_blob++;
-
-       linki = malloc(titles * sizeof(art_id*));
-       linkis = malloc(titles * sizeof(art_id));
-
-       for (i = 0; i < titles; i++) {
-               art_id j;
-
-               linkis[i] = *link_blob;
-               link_blob++;
-
-               linki[i] = link_blob;
-               link_blob += linkis[i];
-       }
-       printf("Incoming links prepared.\n");
-
-
-
-
-
-       /*
-        * Read all titles into memory
-        */
-
-
-
-       in_file = fopen("titles-sorted.txt", "r");
-       fseek(in_file, 0, SEEK_END);
-       title_blob_len = ftell(in_file);
-       rewind(in_file);
-       printf("Title blob size: %zd bytes.\n", title_blob_len);
-
-       title_blob = malloc(title_blob_len + 1);
-       if (!title_blob) {
-               printf("Failed to allocate memory.\n");
-               return 1;
-       }
-
-       if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) {
-               printf("Failed to read entire file in one go.\n");
-               return 1;
-       }
-       printf("Title blob read (%zd bytes).\n", ftell(in_file));
-       fclose(in_file);
-
-       title_blob[title_blob_len] = '\0';
-
-       title = malloc((titles + 1) * sizeof(title[0]));
-
-       title[0] = title_blob;
-       titles_read = 1;
-       while (title_blob = strchr(title_blob, '\n')) {
-               title_blob[0] = '\0';
-               title_blob++;
-               title[titles_read] = title_blob;
-               titles_read++;
-       }
-
-       /* Last title will be an empty string, as the file ends with a newline. */
-       printf("Titles prepared: %d of %d.\n", titles_read - 1, titles);
-
-
-
-
-       /* Look up article ID */
-       cur_title = bsearch(argv[1], title,
-                               titles, sizeof(title[0]),
-                               cmpstring_p_pp);
-
-       if (!cur_title) {
-               printf("TITLE NOT FOUND: %s\n", argv[1]);
-               return 1;
-       }
-
-       title_id = cur_title - title;
-
-
-       #if 0
-       /* Print first degree incoming edges */
-       printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
-       for (i = 0; i < linkis[title_id]; i++) {
-               art_id x = linki[title_id][i];
-               printf("  %s\n", title[x]);
-       }
-       #endif
-
-
-
-       printf("\n\n\n\nBuilding table of distances...\n\n");
-
-       titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD);
-       titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type);
-
-BENCHMARK:
-
-       titles_seen = calloc(titles_bitfield_bytes, 1);
-       titles_prev_round = NULL;
-       titles_this_round = calloc(titles_bitfield_bytes, 1);
-
-       cur_dist = 0;
-
-       BIT_SET(titles_seen, title_id);
-       BIT_SET(titles_this_round, title_id);
-
-       while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) {
-               art_id i;
-
-               cur_dist++;
-               printf("Checking distance %d\n", cur_dist);
-
-               free(titles_prev_round); /* free is safe to call on NULL */
-               titles_prev_round = titles_this_round;
-               titles_this_round = calloc(titles_bitfield_bytes, 1);
-
-               for (i = 0; i < titles; i++) {
-                       int j;
-
-                       if (!BIT_ISSET(titles_prev_round, i))
-                               continue;
-
-                       for (j = 0; j < linkis[i]; j++) {
-                               art_id x = linki[i][j];
-                               BIT_SET(titles_this_round, x);
-                       }
-               }
-
-               for (i = 0; i < titles_bitfield_elems; i++) {
-                       titles_this_round[i] &= ~titles_seen[i];
-                       titles_seen[i] |= titles_this_round[i];
-               }
-       }
-
-       /* Result is in titles_prev_round */
-
-       cur_dist -= 1;
-
-       printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
-       for (i = 0; i < titles; i++) {
-               if (BIT_ISSET(titles_prev_round, i)) {
-                       printf("  %s\n", title[i]);
-               }
-       }
-
-
-       free(titles_seen);
-       free(titles_prev_round);
-       free(titles_this_round);
-
-       //goto BENCHMARK;
-
-       return 0;
-}
diff --git a/src/links-outgoing-to-incoming.c b/src/links-outgoing-to-incoming.c
new file mode 100644 (file)
index 0000000..2fd853a
--- /dev/null
@@ -0,0 +1,92 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+typedef __uint32_t art_id;
+
+
+int main()
+{
+       FILE *in_file;
+       FILE *out_file;
+
+       art_id titles;
+
+       art_id **linko;
+       art_id *linkos;
+
+       art_id **linki;
+       art_id *linkis;
+
+       art_id link_titles_done = 0;
+
+       art_id i;
+
+
+       /*
+        * Read all outgoing links into memory
+        */
+
+       in_file = fopen("links-outgoing.bin", "rb");
+
+       fread(&titles, sizeof(titles), 1, in_file);
+
+       linko = malloc(titles * sizeof(art_id*));
+       linkos = malloc(titles * sizeof(art_id));
+
+       for (i = 0; i < titles; i++) {
+               art_id j;
+
+               fread(&linkos[i], sizeof(linkos[i]), 1, in_file);
+
+               linko[i] = malloc(linkos[i] * sizeof(linko[i][0]));
+
+               for (j = 0; j < linkos[i]; j++) {
+                       fread(&linko[i][j], sizeof(linko[i][j]), 1, in_file);
+               }
+       }
+       fclose(in_file);
+
+       printf("Outgoing links read.\n");
+
+
+
+       linki = malloc(titles * sizeof(art_id*));
+       linkis = malloc(titles * sizeof(art_id));
+
+       for (i = 0; i < titles; i++) {
+               art_id j;
+
+               for (j = 0; j < linkos[i]; j++) {
+                       art_id x = linko[i][j];
+
+                       linkis[x]++;
+                       linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0]));
+
+                       linki[x][linkis[x] - 1] = i;
+               }
+       }
+
+       printf("Links turned upside down.\n");
+
+
+
+       out_file = fopen("links-incoming.bin", "wb");
+       fwrite(&titles, sizeof(titles), 1, out_file);
+       for (i = 0; i < titles; i++) {
+               art_id j;
+
+               fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file);
+
+               for (j = 0; j < linkis[i]; j++) {
+                       fwrite(&linki[i][j], sizeof(linki[i][j]), 1, out_file);
+               }
+       }
+       fclose(out_file);
+
+       printf("Incoming links written.\n");
+}
diff --git a/src/links-plain-to-binary.c b/src/links-plain-to-binary.c
new file mode 100644 (file)
index 0000000..005a496
--- /dev/null
@@ -0,0 +1,178 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+typedef __uint32_t art_id;
+
+
+static int
+cmpstring_pp_pp(const void *p1, const void *p2)
+{
+       return strcasecmp(* (char * const *) p1, * (char * const *) p2);
+}
+
+static int
+cmpstring_p_pp(const void *p1, const void *p2)
+{
+       return strcasecmp((char * const) p1, * (char * const *) p2);
+}
+
+
+
+
+int main()
+{
+       FILE *in_file;
+       FILE *out_file;
+
+       char **title = NULL;
+       art_id titles = 0;
+       art_id titles_alloc = 0;
+
+       art_id **link;
+       art_id *links;
+
+       art_id link_titles_done = 0;
+
+       art_id i;
+
+
+       /*
+        * Read all titles into memory
+        */
+
+       in_file = fopen("titles.txt", "r");
+       while (!feof(in_file)) {
+               char *in_line = NULL;
+               ssize_t in_line_len = 0;
+               size_t zero = 0;
+
+               if (titles == titles_alloc) {
+                       titles_alloc += 100000;
+                       title = realloc(title, titles_alloc * sizeof(title[0]));
+               }
+
+               in_line_len = getline(&in_line, &zero, in_file);
+
+               /* Ignore empty lines and errors */
+               if (in_line_len < 2) {
+                       continue;
+               }
+
+               /* Delete trailing newline */
+               in_line[in_line_len - 1] = '\0';
+
+               title[titles] = in_line;
+               titles++;
+       }
+       fclose(in_file);
+
+       qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
+
+       printf("Sorting done.\n");
+
+
+
+       link = malloc(titles * sizeof(art_id*));
+       links = malloc(titles * sizeof(art_id));
+
+       in_file = fopen("enwiki-links-plain.txt", "r");
+       while (!feof(in_file)) {
+               char *in_line = NULL;
+               ssize_t in_line_len = 0;
+               size_t zero = 0;
+
+               in_line_len = getline(&in_line, &zero, in_file);
+
+               /* Ignore empty lines and errors */
+               if (in_line_len < 2) {
+                       //printf("%d\n", links[i]);
+                       if (in_line)
+                               free(in_line);
+                       continue;
+               }
+
+               /* Delete trailing newline */
+               in_line[in_line_len - 1] = '\0';
+
+               if (in_line_len > 5
+                   && !memcmp(in_line, "~~~~", 4)) {
+                       /* Title */
+                       char **cur_title = bsearch(&in_line[4], title,
+                                               titles, sizeof(title[0]),
+                                               cmpstring_p_pp);
+
+                       if (!cur_title) {
+                               printf("TITLE NOT FOUND: %s\n", in_line);
+                               assert(cur_title);
+                       }
+
+                       i = cur_title - title;
+                       //printf("%s\n", title[i]);
+
+                       link_titles_done++;
+                       if (0 == (link_titles_done % 100000)) {
+                               printf("%d\n", link_titles_done);
+                       }
+               } else {
+                       /* Link */
+
+                       /* Delete trailing anchor */
+                       strtok(in_line, "#");
+
+                       char **cur_link = bsearch(&in_line[0], title,
+                                               titles, sizeof(title[0]),
+                                               cmpstring_p_pp);
+
+                       if (!cur_link) {
+                               //printf("LINK NOT FOUND: %s\n", in_line);
+                               free(in_line);
+                               continue;
+                       }
+
+                       links[i]++;
+                       link[i] = realloc(link[i], links[i] * sizeof(art_id));
+
+                       link[i][links[i] - 1] = cur_link - title;
+
+                       //printf("%s -- %d\n", title[i], links[i]);
+               }
+
+               free(in_line);
+       }
+       fclose(in_file);
+
+       printf("Links identified.\n");
+
+
+
+       out_file = fopen("titles-sorted.txt", "w");
+       for (i = 0; i < titles; i++) {
+               fputs(title[i], out_file);
+               fputs("\n", out_file);
+       }
+       fclose(out_file);
+
+       printf("Titles written.\n");
+
+
+
+       out_file = fopen("links-outgoing.bin", "wb");
+       fwrite(&titles, sizeof(titles), 1, out_file);
+       for (i = 0; i < titles; i++) {
+               art_id j;
+
+               fwrite(&links[i], sizeof(links[i]), 1, out_file);
+
+               for (j = 0; j < links[i]; j++) {
+                       fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
+               }
+       }
+       fclose(out_file);
+
+       printf("Links written.\n");
+}
diff --git a/src/links-xml-to-plain.l b/src/links-xml-to-plain.l
new file mode 100644 (file)
index 0000000..dabe538
--- /dev/null
@@ -0,0 +1,10 @@
+%%
+\<d\>     ;
+\<\/d\>   ;
+\<p>      ;
+\<\/p\>   printf("\n");
+\<t>      printf("~~~~");
+\<\/t\>   printf("\n");
+\<l>      ;
+\<\/l\>   printf("\n");
+%%
\ No newline at end of file
diff --git a/src/lookup-incoming.c b/src/lookup-incoming.c
new file mode 100644 (file)
index 0000000..2d3fa9c
--- /dev/null
@@ -0,0 +1,257 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y))
+
+
+typedef __uint32_t bitfield_type;
+#define BITS_PER_BITFIELD 32
+#define BITS_PER_BITFIELD_LOG 5
+
+#define BIT_SET(set, d) \
+  ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) )
+#define BIT_ISSET(set, d) \
+  (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) ))
+
+int ANY_BITS_SET(bitfield_type *set, size_t elems)
+{
+       size_t i;
+
+       for (i = 0; i < elems; i++)
+               if (set[i] != 0)
+                       return 1;
+
+       return 0;
+}
+
+
+typedef __uint32_t art_id;
+
+
+
+static int
+cmpstring_p_pp(const void *p1, const void *p2)
+{
+       return strcasecmp((char * const) p1, * (char * const *) p2);
+}
+
+
+
+
+int main(int argc, char **argv)
+{
+       FILE *in_file;
+       FILE *out_file;
+
+       char *title_blob;
+       size_t title_blob_len;
+       char **title = NULL;
+       art_id titles;
+       art_id titles_read = 0;
+
+       art_id *link_blob;
+       size_t link_blob_len;
+       art_id **linki;
+       art_id *linkis;
+
+       char **cur_title;
+       art_id title_id;
+
+       art_id cur_dist;
+
+       art_id i;
+
+       bitfield_type *titles_seen;
+       bitfield_type *titles_prev_round;
+       bitfield_type *titles_this_round;
+       size_t titles_bitfield_elems;
+       size_t titles_bitfield_bytes;
+
+
+       if (argc < 2) {
+               printf("Supply article name to look up articles linking to it.\n");
+               return 1;
+       }
+
+
+       /*
+        * Read all incoming links into memory
+        */
+
+       in_file = fopen("links-incoming.bin", "rb");
+       fseek(in_file, 0, SEEK_END);
+       link_blob_len = ftell(in_file);
+       rewind(in_file);
+       printf("Link blob size: %zd bytes.\n", link_blob_len);
+
+       link_blob = malloc(link_blob_len);
+       if (!link_blob) {
+               printf("Failed to allocate memory.\n");
+               return 1;
+       }
+
+       if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) {
+               printf("Failed to read entire file in one go.\n");
+               return 1;
+       }
+       printf("Link blob read (%zd bytes).\n", ftell(in_file));
+       fclose(in_file);
+
+       titles = *link_blob;
+       link_blob++;
+
+       linki = malloc(titles * sizeof(art_id*));
+       linkis = malloc(titles * sizeof(art_id));
+
+       for (i = 0; i < titles; i++) {
+               art_id j;
+
+               linkis[i] = *link_blob;
+               link_blob++;
+
+               linki[i] = link_blob;
+               link_blob += linkis[i];
+       }
+       printf("Incoming links prepared.\n");
+
+
+
+
+
+       /*
+        * Read all titles into memory
+        */
+
+
+
+       in_file = fopen("titles-sorted.txt", "r");
+       fseek(in_file, 0, SEEK_END);
+       title_blob_len = ftell(in_file);
+       rewind(in_file);
+       printf("Title blob size: %zd bytes.\n", title_blob_len);
+
+       title_blob = malloc(title_blob_len + 1);
+       if (!title_blob) {
+               printf("Failed to allocate memory.\n");
+               return 1;
+       }
+
+       if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) {
+               printf("Failed to read entire file in one go.\n");
+               return 1;
+       }
+       printf("Title blob read (%zd bytes).\n", ftell(in_file));
+       fclose(in_file);
+
+       title_blob[title_blob_len] = '\0';
+
+       title = malloc((titles + 1) * sizeof(title[0]));
+
+       title[0] = title_blob;
+       titles_read = 1;
+       while (title_blob = strchr(title_blob, '\n')) {
+               title_blob[0] = '\0';
+               title_blob++;
+               title[titles_read] = title_blob;
+               titles_read++;
+       }
+
+       /* Last title will be an empty string, as the file ends with a newline. */
+       printf("Titles prepared: %d of %d.\n", titles_read - 1, titles);
+
+
+
+
+       /* Look up article ID */
+       cur_title = bsearch(argv[1], title,
+                               titles, sizeof(title[0]),
+                               cmpstring_p_pp);
+
+       if (!cur_title) {
+               printf("TITLE NOT FOUND: %s\n", argv[1]);
+               return 1;
+       }
+
+       title_id = cur_title - title;
+
+
+       #if 0
+       /* Print first degree incoming edges */
+       printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
+       for (i = 0; i < linkis[title_id]; i++) {
+               art_id x = linki[title_id][i];
+               printf("  %s\n", title[x]);
+       }
+       #endif
+
+
+
+       printf("\n\n\n\nBuilding table of distances...\n\n");
+
+       titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD);
+       titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type);
+
+BENCHMARK:
+
+       titles_seen = calloc(titles_bitfield_bytes, 1);
+       titles_prev_round = NULL;
+       titles_this_round = calloc(titles_bitfield_bytes, 1);
+
+       cur_dist = 0;
+
+       BIT_SET(titles_seen, title_id);
+       BIT_SET(titles_this_round, title_id);
+
+       while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) {
+               art_id i;
+
+               cur_dist++;
+               printf("Checking distance %d\n", cur_dist);
+
+               free(titles_prev_round); /* free is safe to call on NULL */
+               titles_prev_round = titles_this_round;
+               titles_this_round = calloc(titles_bitfield_bytes, 1);
+
+               for (i = 0; i < titles; i++) {
+                       int j;
+
+                       if (!BIT_ISSET(titles_prev_round, i))
+                               continue;
+
+                       for (j = 0; j < linkis[i]; j++) {
+                               art_id x = linki[i][j];
+                               BIT_SET(titles_this_round, x);
+                       }
+               }
+
+               for (i = 0; i < titles_bitfield_elems; i++) {
+                       titles_this_round[i] &= ~titles_seen[i];
+                       titles_seen[i] |= titles_this_round[i];
+               }
+       }
+
+       /* Result is in titles_prev_round */
+
+       cur_dist -= 1;
+
+       printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
+       for (i = 0; i < titles; i++) {
+               if (BIT_ISSET(titles_prev_round, i)) {
+                       printf("  %s\n", title[i]);
+               }
+       }
+
+
+       free(titles_seen);
+       free(titles_prev_round);
+       free(titles_this_round);
+
+       //goto BENCHMARK;
+
+       return 0;
+}