--- /dev/null
+bin/
+enwiki-links-plain.txt
+titles.txt
+titles-sorted.txt
+links-outgoing.bin
+links-incoming.bin
--- /dev/null
+GCC=gcc -O3 -g
+BINDIR=bin
+SRCDIR=src
+
+all: lookuptools preptools dbfiles
+
+
+$(BINDIR):
+ mkdir -p $(BINDIR)
+
+
+lookuptools: $(BINDIR)/lookup-incoming
+
+$(BINDIR)/lookup-incoming: $(SRCDIR)/lookup-incoming.c
+ mkdir -p $(BINDIR)
+ $(GCC) -o $@ $<
+
+
+
+preptools: $(BINDIR)/links-xml-to-plain \
+ $(BINDIR)/links-outgoing-to-incoming \
+ $(BINDIR)/links-plain-to-binary
+
+$(BINDIR)/links-xml-to-plain: $(SRCDIR)/links-xml-to-plain.l
+ mkdir -p $(BINDIR)
+ # Compile flex program to regex-convert the XML to plain text
+ #https://unix.stackexchange.com/a/413684
+ flex -o $(BINDIR)/links-xml-to-plain.c $^
+ $(GCC) -o $@ $(BINDIR)/links-xml-to-plain.c -lfl
+
+$(BINDIR)/links-plain-to-binary: $(SRCDIR)/links-plain-to-binary.c titles.txt
+ mkdir -p $(BINDIR)
+ $(GCC) -o $@ $<
+
+$(BINDIR)/links-outgoing-to-incoming: $(SRCDIR)/links-outgoing-to-incoming.c
+ mkdir -p $(BINDIR)
+ $(GCC) -o $@ $<
+
+
+
+dbfiles: links-incoming.bin titles-sorted.txt
+
+enwiki-links-plain.txt: $(BINDIR)/links-xml-to-plain enwiki-links.xml.lzo
+ # Convert to plain text
+ lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | $(BINDIR)/links-xml-to-plain > enwiki-links-plain.txt
+
+titles.txt: enwiki-links-plain.txt
+ # Extract titles
+ grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt
+
+links-outgoing.bin: $(BINDIR)/links-plain-to-binary titles.txt
+ # This also produces titles-sorted.txt
+ $(BINDIR)/links-plain-to-binary
+ # Update timestamp so make does not rebuild links-outgoing.bin
+ touch titles-sorted.txt
+
+titles-sorted.txt: links-outgoing.bin
+ # Generated in the same step as links-outgoing.bin
+
+links-incoming.bin: $(BINDIR)/links-outgoing-to-incoming links-outgoing.bin
+ $(BINDIR)/links-outgoing-to-incoming
+++ /dev/null
-#!/bin/bash
-
-# Compile flex program to regex-convert the XML to plain text
-#https://unix.stackexchange.com/a/413684
-flex -o links-xml-to-plain.c links-xml-to-plain.l
-gcc -O3 -o links-xml-to-plain links-xml-to-plain.c -lfl
-
-# Convert to plain text
-lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | ./links-xml-to-plain > enwiki-links-plain.txt
-
-# Extract titles
-grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt
-
-
-
-gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c
-time ./links-plain-to-binary
-
-gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c
-time ./links-outgoing-to-incoming
+++ /dev/null
-#include <assert.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-
-
-typedef __uint32_t art_id;
-
-
-int main()
-{
- FILE *in_file;
- FILE *out_file;
-
- art_id titles;
-
- art_id **linko;
- art_id *linkos;
-
- art_id **linki;
- art_id *linkis;
-
- art_id link_titles_done = 0;
-
- art_id i;
-
-
- /*
- * Read all outgoing links into memory
- */
-
- in_file = fopen("links-outgoing.bin", "rb");
-
- fread(&titles, sizeof(titles), 1, in_file);
-
- linko = malloc(titles * sizeof(art_id*));
- linkos = malloc(titles * sizeof(art_id));
-
- for (i = 0; i < titles; i++) {
- art_id j;
-
- fread(&linkos[i], sizeof(linkos[i]), 1, in_file);
-
- linko[i] = malloc(linkos[i] * sizeof(linko[i][0]));
-
- for (j = 0; j < linkos[i]; j++) {
- fread(&linko[i][j], sizeof(linko[i][j]), 1, in_file);
- }
- }
- fclose(in_file);
-
- printf("Outgoing links read.\n");
-
-
-
- linki = malloc(titles * sizeof(art_id*));
- linkis = malloc(titles * sizeof(art_id));
-
- for (i = 0; i < titles; i++) {
- art_id j;
-
- for (j = 0; j < linkos[i]; j++) {
- art_id x = linko[i][j];
-
- linkis[x]++;
- linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0]));
-
- linki[x][linkis[x] - 1] = i;
- }
- }
-
- printf("Links turned upside down.\n");
-
-
-
- out_file = fopen("links-incoming.bin", "wb");
- fwrite(&titles, sizeof(titles), 1, out_file);
- for (i = 0; i < titles; i++) {
- art_id j;
-
- fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file);
-
- for (j = 0; j < linkis[i]; j++) {
- fwrite(&linki[i][j], sizeof(linki[i][j]), 1, out_file);
- }
- }
- fclose(out_file);
-
- printf("Incoming links written.\n");
-}
+++ /dev/null
-#include <assert.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-
-
-typedef __uint32_t art_id;
-
-
-static int
-cmpstring_pp_pp(const void *p1, const void *p2)
-{
- return strcasecmp(* (char * const *) p1, * (char * const *) p2);
-}
-
-static int
-cmpstring_p_pp(const void *p1, const void *p2)
-{
- return strcasecmp((char * const) p1, * (char * const *) p2);
-}
-
-
-
-
-int main()
-{
- FILE *in_file;
- FILE *out_file;
-
- char **title = NULL;
- art_id titles = 0;
- art_id titles_alloc = 0;
-
- art_id **link;
- art_id *links;
-
- art_id link_titles_done = 0;
-
- art_id i;
-
-
- /*
- * Read all titles into memory
- */
-
- in_file = fopen("titles.txt", "r");
- while (!feof(in_file)) {
- char *in_line = NULL;
- ssize_t in_line_len = 0;
- size_t zero = 0;
-
- if (titles == titles_alloc) {
- titles_alloc += 100000;
- title = realloc(title, titles_alloc * sizeof(title[0]));
- }
-
- in_line_len = getline(&in_line, &zero, in_file);
-
- /* Ignore empty lines and errors */
- if (in_line_len < 2) {
- continue;
- }
-
- /* Delete trailing newline */
- in_line[in_line_len - 1] = '\0';
-
- title[titles] = in_line;
- titles++;
- }
- fclose(in_file);
-
- qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
-
- printf("Sorting done.\n");
-
-
-
- link = malloc(titles * sizeof(art_id*));
- links = malloc(titles * sizeof(art_id));
-
- in_file = fopen("enwiki-links-plain.txt", "r");
- while (!feof(in_file)) {
- char *in_line = NULL;
- ssize_t in_line_len = 0;
- size_t zero = 0;
-
- in_line_len = getline(&in_line, &zero, in_file);
-
- /* Ignore empty lines and errors */
- if (in_line_len < 2) {
- //printf("%d\n", links[i]);
- if (in_line)
- free(in_line);
- continue;
- }
-
- /* Delete trailing newline */
- in_line[in_line_len - 1] = '\0';
-
- if (in_line_len > 5
- && !memcmp(in_line, "~~~~", 4)) {
- /* Title */
- char **cur_title = bsearch(&in_line[4], title,
- titles, sizeof(title[0]),
- cmpstring_p_pp);
-
- if (!cur_title) {
- printf("TITLE NOT FOUND: %s\n", in_line);
- assert(cur_title);
- }
-
- i = cur_title - title;
- //printf("%s\n", title[i]);
-
- link_titles_done++;
- if (0 == (link_titles_done % 100000)) {
- printf("%d\n", link_titles_done);
- }
- } else {
- /* Link */
-
- /* Delete trailing anchor */
- strtok(in_line, "#");
-
- char **cur_link = bsearch(&in_line[0], title,
- titles, sizeof(title[0]),
- cmpstring_p_pp);
-
- if (!cur_link) {
- //printf("LINK NOT FOUND: %s\n", in_line);
- free(in_line);
- continue;
- }
-
- links[i]++;
- link[i] = realloc(link[i], links[i] * sizeof(art_id));
-
- link[i][links[i] - 1] = cur_link - title;
-
- //printf("%s -- %d\n", title[i], links[i]);
- }
-
- free(in_line);
- }
- fclose(in_file);
-
- printf("Links identified.\n");
-
-
-
- out_file = fopen("titles-sorted.txt", "w");
- for (i = 0; i < titles; i++) {
- fputs(title[i], out_file);
- fputs("\n", out_file);
- }
- fclose(out_file);
-
- printf("Titles written.\n");
-
-
-
- out_file = fopen("links-outgoing.bin", "wb");
- fwrite(&titles, sizeof(titles), 1, out_file);
- for (i = 0; i < titles; i++) {
- art_id j;
-
- fwrite(&links[i], sizeof(links[i]), 1, out_file);
-
- for (j = 0; j < links[i]; j++) {
- fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
- }
- }
- fclose(out_file);
-
- printf("Links written.\n");
-}
+++ /dev/null
-%%
-\<d\> ;
-\<\/d\> ;
-\<p> ;
-\<\/p\> printf("\n");
-\<t> printf("~~~~");
-\<\/t\> printf("\n");
-\<l> ;
-\<\/l\> printf("\n");
-%%
\ No newline at end of file
+++ /dev/null
-#include <assert.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-
-
-#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y))
-
-
-typedef __uint32_t bitfield_type;
-#define BITS_PER_BITFIELD 32
-#define BITS_PER_BITFIELD_LOG 5
-
-#define BIT_SET(set, d) \
- ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) )
-#define BIT_ISSET(set, d) \
- (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) ))
-
-int ANY_BITS_SET(bitfield_type *set, size_t elems)
-{
- size_t i;
-
- for (i = 0; i < elems; i++)
- if (set[i] != 0)
- return 1;
-
- return 0;
-}
-
-
-typedef __uint32_t art_id;
-
-
-
-static int
-cmpstring_p_pp(const void *p1, const void *p2)
-{
- return strcasecmp((char * const) p1, * (char * const *) p2);
-}
-
-
-
-
-int main(int argc, char **argv)
-{
- FILE *in_file;
- FILE *out_file;
-
- char *title_blob;
- size_t title_blob_len;
- char **title = NULL;
- art_id titles;
- art_id titles_read = 0;
-
- art_id *link_blob;
- size_t link_blob_len;
- art_id **linki;
- art_id *linkis;
-
- char **cur_title;
- art_id title_id;
-
- art_id cur_dist;
-
- art_id i;
-
- bitfield_type *titles_seen;
- bitfield_type *titles_prev_round;
- bitfield_type *titles_this_round;
- size_t titles_bitfield_elems;
- size_t titles_bitfield_bytes;
-
-
- if (argc < 2) {
- printf("Supply article name to look up articles linking to it.\n");
- return 1;
- }
-
-
- /*
- * Read all incoming links into memory
- */
-
- in_file = fopen("links-incoming.bin", "rb");
- fseek(in_file, 0, SEEK_END);
- link_blob_len = ftell(in_file);
- rewind(in_file);
- printf("Link blob size: %zd bytes.\n", link_blob_len);
-
- link_blob = malloc(link_blob_len);
- if (!link_blob) {
- printf("Failed to allocate memory.\n");
- return 1;
- }
-
- if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) {
- printf("Failed to read entire file in one go.\n");
- return 1;
- }
- printf("Link blob read (%zd bytes).\n", ftell(in_file));
- fclose(in_file);
-
- titles = *link_blob;
- link_blob++;
-
- linki = malloc(titles * sizeof(art_id*));
- linkis = malloc(titles * sizeof(art_id));
-
- for (i = 0; i < titles; i++) {
- art_id j;
-
- linkis[i] = *link_blob;
- link_blob++;
-
- linki[i] = link_blob;
- link_blob += linkis[i];
- }
- printf("Incoming links prepared.\n");
-
-
-
-
-
- /*
- * Read all titles into memory
- */
-
-
-
- in_file = fopen("titles-sorted.txt", "r");
- fseek(in_file, 0, SEEK_END);
- title_blob_len = ftell(in_file);
- rewind(in_file);
- printf("Title blob size: %zd bytes.\n", title_blob_len);
-
- title_blob = malloc(title_blob_len + 1);
- if (!title_blob) {
- printf("Failed to allocate memory.\n");
- return 1;
- }
-
- if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) {
- printf("Failed to read entire file in one go.\n");
- return 1;
- }
- printf("Title blob read (%zd bytes).\n", ftell(in_file));
- fclose(in_file);
-
- title_blob[title_blob_len] = '\0';
-
- title = malloc((titles + 1) * sizeof(title[0]));
-
- title[0] = title_blob;
- titles_read = 1;
- while (title_blob = strchr(title_blob, '\n')) {
- title_blob[0] = '\0';
- title_blob++;
- title[titles_read] = title_blob;
- titles_read++;
- }
-
- /* Last title will be an empty string, as the file ends with a newline. */
- printf("Titles prepared: %d of %d.\n", titles_read - 1, titles);
-
-
-
-
- /* Look up article ID */
- cur_title = bsearch(argv[1], title,
- titles, sizeof(title[0]),
- cmpstring_p_pp);
-
- if (!cur_title) {
- printf("TITLE NOT FOUND: %s\n", argv[1]);
- return 1;
- }
-
- title_id = cur_title - title;
-
-
- #if 0
- /* Print first degree incoming edges */
- printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
- for (i = 0; i < linkis[title_id]; i++) {
- art_id x = linki[title_id][i];
- printf(" %s\n", title[x]);
- }
- #endif
-
-
-
- printf("\n\n\n\nBuilding table of distances...\n\n");
-
- titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD);
- titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type);
-
-BENCHMARK:
-
- titles_seen = calloc(titles_bitfield_bytes, 1);
- titles_prev_round = NULL;
- titles_this_round = calloc(titles_bitfield_bytes, 1);
-
- cur_dist = 0;
-
- BIT_SET(titles_seen, title_id);
- BIT_SET(titles_this_round, title_id);
-
- while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) {
- art_id i;
-
- cur_dist++;
- printf("Checking distance %d\n", cur_dist);
-
- free(titles_prev_round); /* free is safe to call on NULL */
- titles_prev_round = titles_this_round;
- titles_this_round = calloc(titles_bitfield_bytes, 1);
-
- for (i = 0; i < titles; i++) {
- int j;
-
- if (!BIT_ISSET(titles_prev_round, i))
- continue;
-
- for (j = 0; j < linkis[i]; j++) {
- art_id x = linki[i][j];
- BIT_SET(titles_this_round, x);
- }
- }
-
- for (i = 0; i < titles_bitfield_elems; i++) {
- titles_this_round[i] &= ~titles_seen[i];
- titles_seen[i] |= titles_this_round[i];
- }
- }
-
- /* Result is in titles_prev_round */
-
- cur_dist -= 1;
-
- printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
- for (i = 0; i < titles; i++) {
- if (BIT_ISSET(titles_prev_round, i)) {
- printf(" %s\n", title[i]);
- }
- }
-
-
- free(titles_seen);
- free(titles_prev_round);
- free(titles_this_round);
-
- //goto BENCHMARK;
-
- return 0;
-}
--- /dev/null
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+typedef __uint32_t art_id;
+
+
+int main()
+{
+ FILE *in_file;
+ FILE *out_file;
+
+ art_id titles;
+
+ art_id **linko;
+ art_id *linkos;
+
+ art_id **linki;
+ art_id *linkis;
+
+ art_id link_titles_done = 0;
+
+ art_id i;
+
+
+ /*
+ * Read all outgoing links into memory
+ */
+
+ in_file = fopen("links-outgoing.bin", "rb");
+
+ fread(&titles, sizeof(titles), 1, in_file);
+
+ linko = malloc(titles * sizeof(art_id*));
+ linkos = malloc(titles * sizeof(art_id));
+
+ for (i = 0; i < titles; i++) {
+ art_id j;
+
+ fread(&linkos[i], sizeof(linkos[i]), 1, in_file);
+
+ linko[i] = malloc(linkos[i] * sizeof(linko[i][0]));
+
+ for (j = 0; j < linkos[i]; j++) {
+ fread(&linko[i][j], sizeof(linko[i][j]), 1, in_file);
+ }
+ }
+ fclose(in_file);
+
+ printf("Outgoing links read.\n");
+
+
+
+ linki = malloc(titles * sizeof(art_id*));
+ linkis = malloc(titles * sizeof(art_id));
+
+ for (i = 0; i < titles; i++) {
+ art_id j;
+
+ for (j = 0; j < linkos[i]; j++) {
+ art_id x = linko[i][j];
+
+ linkis[x]++;
+ linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0]));
+
+ linki[x][linkis[x] - 1] = i;
+ }
+ }
+
+ printf("Links turned upside down.\n");
+
+
+
+ out_file = fopen("links-incoming.bin", "wb");
+ fwrite(&titles, sizeof(titles), 1, out_file);
+ for (i = 0; i < titles; i++) {
+ art_id j;
+
+ fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file);
+
+ for (j = 0; j < linkis[i]; j++) {
+ fwrite(&linki[i][j], sizeof(linki[i][j]), 1, out_file);
+ }
+ }
+ fclose(out_file);
+
+ printf("Incoming links written.\n");
+}
--- /dev/null
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+typedef __uint32_t art_id;
+
+
+static int
+cmpstring_pp_pp(const void *p1, const void *p2)
+{
+ return strcasecmp(* (char * const *) p1, * (char * const *) p2);
+}
+
+static int
+cmpstring_p_pp(const void *p1, const void *p2)
+{
+ return strcasecmp((char * const) p1, * (char * const *) p2);
+}
+
+
+
+
+int main()
+{
+ FILE *in_file;
+ FILE *out_file;
+
+ char **title = NULL;
+ art_id titles = 0;
+ art_id titles_alloc = 0;
+
+ art_id **link;
+ art_id *links;
+
+ art_id link_titles_done = 0;
+
+ art_id i;
+
+
+ /*
+ * Read all titles into memory
+ */
+
+ in_file = fopen("titles.txt", "r");
+ while (!feof(in_file)) {
+ char *in_line = NULL;
+ ssize_t in_line_len = 0;
+ size_t zero = 0;
+
+ if (titles == titles_alloc) {
+ titles_alloc += 100000;
+ title = realloc(title, titles_alloc * sizeof(title[0]));
+ }
+
+ in_line_len = getline(&in_line, &zero, in_file);
+
+ /* Ignore empty lines and errors */
+ if (in_line_len < 2) {
+ continue;
+ }
+
+ /* Delete trailing newline */
+ in_line[in_line_len - 1] = '\0';
+
+ title[titles] = in_line;
+ titles++;
+ }
+ fclose(in_file);
+
+ qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
+
+ printf("Sorting done.\n");
+
+
+
+ link = malloc(titles * sizeof(art_id*));
+ links = malloc(titles * sizeof(art_id));
+
+ in_file = fopen("enwiki-links-plain.txt", "r");
+ while (!feof(in_file)) {
+ char *in_line = NULL;
+ ssize_t in_line_len = 0;
+ size_t zero = 0;
+
+ in_line_len = getline(&in_line, &zero, in_file);
+
+ /* Ignore empty lines and errors */
+ if (in_line_len < 2) {
+ //printf("%d\n", links[i]);
+ if (in_line)
+ free(in_line);
+ continue;
+ }
+
+ /* Delete trailing newline */
+ in_line[in_line_len - 1] = '\0';
+
+ if (in_line_len > 5
+ && !memcmp(in_line, "~~~~", 4)) {
+ /* Title */
+ char **cur_title = bsearch(&in_line[4], title,
+ titles, sizeof(title[0]),
+ cmpstring_p_pp);
+
+ if (!cur_title) {
+ printf("TITLE NOT FOUND: %s\n", in_line);
+ assert(cur_title);
+ }
+
+ i = cur_title - title;
+ //printf("%s\n", title[i]);
+
+ link_titles_done++;
+ if (0 == (link_titles_done % 100000)) {
+ printf("%d\n", link_titles_done);
+ }
+ } else {
+ /* Link */
+
+ /* Delete trailing anchor */
+ strtok(in_line, "#");
+
+ char **cur_link = bsearch(&in_line[0], title,
+ titles, sizeof(title[0]),
+ cmpstring_p_pp);
+
+ if (!cur_link) {
+ //printf("LINK NOT FOUND: %s\n", in_line);
+ free(in_line);
+ continue;
+ }
+
+ links[i]++;
+ link[i] = realloc(link[i], links[i] * sizeof(art_id));
+
+ link[i][links[i] - 1] = cur_link - title;
+
+ //printf("%s -- %d\n", title[i], links[i]);
+ }
+
+ free(in_line);
+ }
+ fclose(in_file);
+
+ printf("Links identified.\n");
+
+
+
+ out_file = fopen("titles-sorted.txt", "w");
+ for (i = 0; i < titles; i++) {
+ fputs(title[i], out_file);
+ fputs("\n", out_file);
+ }
+ fclose(out_file);
+
+ printf("Titles written.\n");
+
+
+
+ out_file = fopen("links-outgoing.bin", "wb");
+ fwrite(&titles, sizeof(titles), 1, out_file);
+ for (i = 0; i < titles; i++) {
+ art_id j;
+
+ fwrite(&links[i], sizeof(links[i]), 1, out_file);
+
+ for (j = 0; j < links[i]; j++) {
+ fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
+ }
+ }
+ fclose(out_file);
+
+ printf("Links written.\n");
+}
--- /dev/null
+%%
+\<d\> ;
+\<\/d\> ;
+\<p> ;
+\<\/p\> printf("\n");
+\<t> printf("~~~~");
+\<\/t\> printf("\n");
+\<l> ;
+\<\/l\> printf("\n");
+%%
\ No newline at end of file
--- /dev/null
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y))
+
+
+typedef __uint32_t bitfield_type;
+#define BITS_PER_BITFIELD 32
+#define BITS_PER_BITFIELD_LOG 5
+
+#define BIT_SET(set, d) \
+ ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) )
+#define BIT_ISSET(set, d) \
+ (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) ))
+
+int ANY_BITS_SET(bitfield_type *set, size_t elems)
+{
+ size_t i;
+
+ for (i = 0; i < elems; i++)
+ if (set[i] != 0)
+ return 1;
+
+ return 0;
+}
+
+
+typedef __uint32_t art_id;
+
+
+
+static int
+cmpstring_p_pp(const void *p1, const void *p2)
+{
+ return strcasecmp((char * const) p1, * (char * const *) p2);
+}
+
+
+
+
+int main(int argc, char **argv)
+{
+ FILE *in_file;
+ FILE *out_file;
+
+ char *title_blob;
+ size_t title_blob_len;
+ char **title = NULL;
+ art_id titles;
+ art_id titles_read = 0;
+
+ art_id *link_blob;
+ size_t link_blob_len;
+ art_id **linki;
+ art_id *linkis;
+
+ char **cur_title;
+ art_id title_id;
+
+ art_id cur_dist;
+
+ art_id i;
+
+ bitfield_type *titles_seen;
+ bitfield_type *titles_prev_round;
+ bitfield_type *titles_this_round;
+ size_t titles_bitfield_elems;
+ size_t titles_bitfield_bytes;
+
+
+ if (argc < 2) {
+ printf("Supply article name to look up articles linking to it.\n");
+ return 1;
+ }
+
+
+ /*
+ * Read all incoming links into memory
+ */
+
+ in_file = fopen("links-incoming.bin", "rb");
+ fseek(in_file, 0, SEEK_END);
+ link_blob_len = ftell(in_file);
+ rewind(in_file);
+ printf("Link blob size: %zd bytes.\n", link_blob_len);
+
+ link_blob = malloc(link_blob_len);
+ if (!link_blob) {
+ printf("Failed to allocate memory.\n");
+ return 1;
+ }
+
+ if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) {
+ printf("Failed to read entire file in one go.\n");
+ return 1;
+ }
+ printf("Link blob read (%zd bytes).\n", ftell(in_file));
+ fclose(in_file);
+
+ titles = *link_blob;
+ link_blob++;
+
+ linki = malloc(titles * sizeof(art_id*));
+ linkis = malloc(titles * sizeof(art_id));
+
+ for (i = 0; i < titles; i++) {
+ art_id j;
+
+ linkis[i] = *link_blob;
+ link_blob++;
+
+ linki[i] = link_blob;
+ link_blob += linkis[i];
+ }
+ printf("Incoming links prepared.\n");
+
+
+
+
+
+ /*
+ * Read all titles into memory
+ */
+
+
+
+ in_file = fopen("titles-sorted.txt", "r");
+ fseek(in_file, 0, SEEK_END);
+ title_blob_len = ftell(in_file);
+ rewind(in_file);
+ printf("Title blob size: %zd bytes.\n", title_blob_len);
+
+ title_blob = malloc(title_blob_len + 1);
+ if (!title_blob) {
+ printf("Failed to allocate memory.\n");
+ return 1;
+ }
+
+ if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) {
+ printf("Failed to read entire file in one go.\n");
+ return 1;
+ }
+ printf("Title blob read (%zd bytes).\n", ftell(in_file));
+ fclose(in_file);
+
+ title_blob[title_blob_len] = '\0';
+
+ title = malloc((titles + 1) * sizeof(title[0]));
+
+ title[0] = title_blob;
+ titles_read = 1;
+ while (title_blob = strchr(title_blob, '\n')) {
+ title_blob[0] = '\0';
+ title_blob++;
+ title[titles_read] = title_blob;
+ titles_read++;
+ }
+
+ /* Last title will be an empty string, as the file ends with a newline. */
+ printf("Titles prepared: %d of %d.\n", titles_read - 1, titles);
+
+
+
+
+ /* Look up article ID */
+ cur_title = bsearch(argv[1], title,
+ titles, sizeof(title[0]),
+ cmpstring_p_pp);
+
+ if (!cur_title) {
+ printf("TITLE NOT FOUND: %s\n", argv[1]);
+ return 1;
+ }
+
+ title_id = cur_title - title;
+
+
+ #if 0
+ /* Print first degree incoming edges */
+ printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
+ for (i = 0; i < linkis[title_id]; i++) {
+ art_id x = linki[title_id][i];
+ printf(" %s\n", title[x]);
+ }
+ #endif
+
+
+
+ printf("\n\n\n\nBuilding table of distances...\n\n");
+
+ titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD);
+ titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type);
+
+BENCHMARK:
+
+ titles_seen = calloc(titles_bitfield_bytes, 1);
+ titles_prev_round = NULL;
+ titles_this_round = calloc(titles_bitfield_bytes, 1);
+
+ cur_dist = 0;
+
+ BIT_SET(titles_seen, title_id);
+ BIT_SET(titles_this_round, title_id);
+
+ while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) {
+ art_id i;
+
+ cur_dist++;
+ printf("Checking distance %d\n", cur_dist);
+
+ free(titles_prev_round); /* free is safe to call on NULL */
+ titles_prev_round = titles_this_round;
+ titles_this_round = calloc(titles_bitfield_bytes, 1);
+
+ for (i = 0; i < titles; i++) {
+ int j;
+
+ if (!BIT_ISSET(titles_prev_round, i))
+ continue;
+
+ for (j = 0; j < linkis[i]; j++) {
+ art_id x = linki[i][j];
+ BIT_SET(titles_this_round, x);
+ }
+ }
+
+ for (i = 0; i < titles_bitfield_elems; i++) {
+ titles_this_round[i] &= ~titles_seen[i];
+ titles_seen[i] |= titles_this_round[i];
+ }
+ }
+
+ /* Result is in titles_prev_round */
+
+ cur_dist -= 1;
+
+ printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
+ for (i = 0; i < titles; i++) {
+ if (BIT_ISSET(titles_prev_round, i)) {
+ printf(" %s\n", title[i]);
+ }
+ }
+
+
+ free(titles_seen);
+ free(titles_prev_round);
+ free(titles_this_round);
+
+ //goto BENCHMARK;
+
+ return 0;
+}