From: norly Date: Sun, 14 Jul 2019 15:28:47 +0000 (+0200) Subject: Move to Makefile and .gitignore X-Git-Url: https://git.enpas.org/?p=enwiki-links-graph.git;a=commitdiff_plain;h=f0f54296b5b445c6ce0e47486bcdcb0deca582ff Move to Makefile and .gitignore --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..20bdc9e --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +bin/ +enwiki-links-plain.txt +titles.txt +titles-sorted.txt +links-outgoing.bin +links-incoming.bin diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0182eff --- /dev/null +++ b/Makefile @@ -0,0 +1,61 @@ +GCC=gcc -O3 -g +BINDIR=bin +SRCDIR=src + +all: lookuptools preptools dbfiles + + +$(BINDIR): + mkdir -p $(BINDIR) + + +lookuptools: $(BINDIR)/lookup-incoming + +$(BINDIR)/lookup-incoming: $(SRCDIR)/lookup-incoming.c + mkdir -p $(BINDIR) + $(GCC) -o $@ $< + + + +preptools: $(BINDIR)/links-xml-to-plain \ + $(BINDIR)/links-outgoing-to-incoming \ + $(BINDIR)/links-plain-to-binary + +$(BINDIR)/links-xml-to-plain: $(SRCDIR)/links-xml-to-plain.l + mkdir -p $(BINDIR) + # Compile flex program to regex-convert the XML to plain text + #https://unix.stackexchange.com/a/413684 + flex -o $(BINDIR)/links-xml-to-plain.c $^ + $(GCC) -o $@ $(BINDIR)/links-xml-to-plain.c -lfl + +$(BINDIR)/links-plain-to-binary: $(SRCDIR)/links-plain-to-binary.c titles.txt + mkdir -p $(BINDIR) + $(GCC) -o $@ $< + +$(BINDIR)/links-outgoing-to-incoming: $(SRCDIR)/links-outgoing-to-incoming.c + mkdir -p $(BINDIR) + $(GCC) -o $@ $< + + + +dbfiles: links-incoming.bin titles-sorted.txt + +enwiki-links-plain.txt: $(BINDIR)/links-xml-to-plain enwiki-links.xml.lzo + # Convert to plain text + lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | $(BINDIR)/links-xml-to-plain > enwiki-links-plain.txt + +titles.txt: enwiki-links-plain.txt + # Extract titles + grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt + +links-outgoing.bin: $(BINDIR)/links-plain-to-binary titles.txt + # This also produces titles-sorted.txt + $(BINDIR)/links-plain-to-binary + # Update timestamp so make does not rebuild links-outgoing.bin + touch titles-sorted.txt + +titles-sorted.txt: links-outgoing.bin + # Generated in the same step as links-outgoing.bin + +links-incoming.bin: $(BINDIR)/links-outgoing-to-incoming links-outgoing.bin + $(BINDIR)/links-outgoing-to-incoming diff --git a/convert-to-plain.sh b/convert-to-plain.sh deleted file mode 100644 index e7bd378..0000000 --- a/convert-to-plain.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -# Compile flex program to regex-convert the XML to plain text -#https://unix.stackexchange.com/a/413684 -flex -o links-xml-to-plain.c links-xml-to-plain.l -gcc -O3 -o links-xml-to-plain links-xml-to-plain.c -lfl - -# Convert to plain text -lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | ./links-xml-to-plain > enwiki-links-plain.txt - -# Extract titles -grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt - - - -gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c -time ./links-plain-to-binary - -gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c -time ./links-outgoing-to-incoming diff --git a/links-outgoing-to-incoming.c b/links-outgoing-to-incoming.c deleted file mode 100644 index 2fd853a..0000000 --- a/links-outgoing-to-incoming.c +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include -#include -#include -#include -#include - - -typedef __uint32_t art_id; - - -int main() -{ - FILE *in_file; - FILE *out_file; - - art_id titles; - - art_id **linko; - art_id *linkos; - - art_id **linki; - art_id *linkis; - - art_id link_titles_done = 0; - - art_id i; - - - /* - * Read all outgoing links into memory - */ - - in_file = fopen("links-outgoing.bin", "rb"); - - fread(&titles, sizeof(titles), 1, in_file); - - linko = malloc(titles * sizeof(art_id*)); - linkos = malloc(titles * sizeof(art_id)); - - for (i = 0; i < titles; i++) { - art_id j; - - fread(&linkos[i], sizeof(linkos[i]), 1, in_file); - - linko[i] = malloc(linkos[i] * sizeof(linko[i][0])); - - for (j = 0; j < linkos[i]; j++) { - fread(&linko[i][j], sizeof(linko[i][j]), 1, in_file); - } - } - fclose(in_file); - - printf("Outgoing links read.\n"); - - - - linki = malloc(titles * sizeof(art_id*)); - linkis = malloc(titles * sizeof(art_id)); - - for (i = 0; i < titles; i++) { - art_id j; - - for (j = 0; j < linkos[i]; j++) { - art_id x = linko[i][j]; - - linkis[x]++; - linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0])); - - linki[x][linkis[x] - 1] = i; - } - } - - printf("Links turned upside down.\n"); - - - - out_file = fopen("links-incoming.bin", "wb"); - fwrite(&titles, sizeof(titles), 1, out_file); - for (i = 0; i < titles; i++) { - art_id j; - - fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file); - - for (j = 0; j < linkis[i]; j++) { - fwrite(&linki[i][j], sizeof(linki[i][j]), 1, out_file); - } - } - fclose(out_file); - - printf("Incoming links written.\n"); -} diff --git a/links-plain-to-binary.c b/links-plain-to-binary.c deleted file mode 100644 index 005a496..0000000 --- a/links-plain-to-binary.c +++ /dev/null @@ -1,178 +0,0 @@ -#include -#include -#include -#include -#include -#include - - -typedef __uint32_t art_id; - - -static int -cmpstring_pp_pp(const void *p1, const void *p2) -{ - return strcasecmp(* (char * const *) p1, * (char * const *) p2); -} - -static int -cmpstring_p_pp(const void *p1, const void *p2) -{ - return strcasecmp((char * const) p1, * (char * const *) p2); -} - - - - -int main() -{ - FILE *in_file; - FILE *out_file; - - char **title = NULL; - art_id titles = 0; - art_id titles_alloc = 0; - - art_id **link; - art_id *links; - - art_id link_titles_done = 0; - - art_id i; - - - /* - * Read all titles into memory - */ - - in_file = fopen("titles.txt", "r"); - while (!feof(in_file)) { - char *in_line = NULL; - ssize_t in_line_len = 0; - size_t zero = 0; - - if (titles == titles_alloc) { - titles_alloc += 100000; - title = realloc(title, titles_alloc * sizeof(title[0])); - } - - in_line_len = getline(&in_line, &zero, in_file); - - /* Ignore empty lines and errors */ - if (in_line_len < 2) { - continue; - } - - /* Delete trailing newline */ - in_line[in_line_len - 1] = '\0'; - - title[titles] = in_line; - titles++; - } - fclose(in_file); - - qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp); - - printf("Sorting done.\n"); - - - - link = malloc(titles * sizeof(art_id*)); - links = malloc(titles * sizeof(art_id)); - - in_file = fopen("enwiki-links-plain.txt", "r"); - while (!feof(in_file)) { - char *in_line = NULL; - ssize_t in_line_len = 0; - size_t zero = 0; - - in_line_len = getline(&in_line, &zero, in_file); - - /* Ignore empty lines and errors */ - if (in_line_len < 2) { - //printf("%d\n", links[i]); - if (in_line) - free(in_line); - continue; - } - - /* Delete trailing newline */ - in_line[in_line_len - 1] = '\0'; - - if (in_line_len > 5 - && !memcmp(in_line, "~~~~", 4)) { - /* Title */ - char **cur_title = bsearch(&in_line[4], title, - titles, sizeof(title[0]), - cmpstring_p_pp); - - if (!cur_title) { - printf("TITLE NOT FOUND: %s\n", in_line); - assert(cur_title); - } - - i = cur_title - title; - //printf("%s\n", title[i]); - - link_titles_done++; - if (0 == (link_titles_done % 100000)) { - printf("%d\n", link_titles_done); - } - } else { - /* Link */ - - /* Delete trailing anchor */ - strtok(in_line, "#"); - - char **cur_link = bsearch(&in_line[0], title, - titles, sizeof(title[0]), - cmpstring_p_pp); - - if (!cur_link) { - //printf("LINK NOT FOUND: %s\n", in_line); - free(in_line); - continue; - } - - links[i]++; - link[i] = realloc(link[i], links[i] * sizeof(art_id)); - - link[i][links[i] - 1] = cur_link - title; - - //printf("%s -- %d\n", title[i], links[i]); - } - - free(in_line); - } - fclose(in_file); - - printf("Links identified.\n"); - - - - out_file = fopen("titles-sorted.txt", "w"); - for (i = 0; i < titles; i++) { - fputs(title[i], out_file); - fputs("\n", out_file); - } - fclose(out_file); - - printf("Titles written.\n"); - - - - out_file = fopen("links-outgoing.bin", "wb"); - fwrite(&titles, sizeof(titles), 1, out_file); - for (i = 0; i < titles; i++) { - art_id j; - - fwrite(&links[i], sizeof(links[i]), 1, out_file); - - for (j = 0; j < links[i]; j++) { - fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file); - } - } - fclose(out_file); - - printf("Links written.\n"); -} diff --git a/links-xml-to-plain.l b/links-xml-to-plain.l deleted file mode 100644 index dabe538..0000000 --- a/links-xml-to-plain.l +++ /dev/null @@ -1,10 +0,0 @@ -%% -\ ; -\<\/d\> ; -\

; -\<\/p\> printf("\n"); -\ printf("~~~~"); -\<\/t\> printf("\n"); -\ ; -\<\/l\> printf("\n"); -%% \ No newline at end of file diff --git a/lookup-incoming.c b/lookup-incoming.c deleted file mode 100644 index 2d3fa9c..0000000 --- a/lookup-incoming.c +++ /dev/null @@ -1,257 +0,0 @@ -#include -#include -#include -#include -#include -#include - - -#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) - - -typedef __uint32_t bitfield_type; -#define BITS_PER_BITFIELD 32 -#define BITS_PER_BITFIELD_LOG 5 - -#define BIT_SET(set, d) \ - ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) ) -#define BIT_ISSET(set, d) \ - (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) )) - -int ANY_BITS_SET(bitfield_type *set, size_t elems) -{ - size_t i; - - for (i = 0; i < elems; i++) - if (set[i] != 0) - return 1; - - return 0; -} - - -typedef __uint32_t art_id; - - - -static int -cmpstring_p_pp(const void *p1, const void *p2) -{ - return strcasecmp((char * const) p1, * (char * const *) p2); -} - - - - -int main(int argc, char **argv) -{ - FILE *in_file; - FILE *out_file; - - char *title_blob; - size_t title_blob_len; - char **title = NULL; - art_id titles; - art_id titles_read = 0; - - art_id *link_blob; - size_t link_blob_len; - art_id **linki; - art_id *linkis; - - char **cur_title; - art_id title_id; - - art_id cur_dist; - - art_id i; - - bitfield_type *titles_seen; - bitfield_type *titles_prev_round; - bitfield_type *titles_this_round; - size_t titles_bitfield_elems; - size_t titles_bitfield_bytes; - - - if (argc < 2) { - printf("Supply article name to look up articles linking to it.\n"); - return 1; - } - - - /* - * Read all incoming links into memory - */ - - in_file = fopen("links-incoming.bin", "rb"); - fseek(in_file, 0, SEEK_END); - link_blob_len = ftell(in_file); - rewind(in_file); - printf("Link blob size: %zd bytes.\n", link_blob_len); - - link_blob = malloc(link_blob_len); - if (!link_blob) { - printf("Failed to allocate memory.\n"); - return 1; - } - - if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) { - printf("Failed to read entire file in one go.\n"); - return 1; - } - printf("Link blob read (%zd bytes).\n", ftell(in_file)); - fclose(in_file); - - titles = *link_blob; - link_blob++; - - linki = malloc(titles * sizeof(art_id*)); - linkis = malloc(titles * sizeof(art_id)); - - for (i = 0; i < titles; i++) { - art_id j; - - linkis[i] = *link_blob; - link_blob++; - - linki[i] = link_blob; - link_blob += linkis[i]; - } - printf("Incoming links prepared.\n"); - - - - - - /* - * Read all titles into memory - */ - - - - in_file = fopen("titles-sorted.txt", "r"); - fseek(in_file, 0, SEEK_END); - title_blob_len = ftell(in_file); - rewind(in_file); - printf("Title blob size: %zd bytes.\n", title_blob_len); - - title_blob = malloc(title_blob_len + 1); - if (!title_blob) { - printf("Failed to allocate memory.\n"); - return 1; - } - - if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) { - printf("Failed to read entire file in one go.\n"); - return 1; - } - printf("Title blob read (%zd bytes).\n", ftell(in_file)); - fclose(in_file); - - title_blob[title_blob_len] = '\0'; - - title = malloc((titles + 1) * sizeof(title[0])); - - title[0] = title_blob; - titles_read = 1; - while (title_blob = strchr(title_blob, '\n')) { - title_blob[0] = '\0'; - title_blob++; - title[titles_read] = title_blob; - titles_read++; - } - - /* Last title will be an empty string, as the file ends with a newline. */ - printf("Titles prepared: %d of %d.\n", titles_read - 1, titles); - - - - - /* Look up article ID */ - cur_title = bsearch(argv[1], title, - titles, sizeof(title[0]), - cmpstring_p_pp); - - if (!cur_title) { - printf("TITLE NOT FOUND: %s\n", argv[1]); - return 1; - } - - title_id = cur_title - title; - - - #if 0 - /* Print first degree incoming edges */ - printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]); - for (i = 0; i < linkis[title_id]; i++) { - art_id x = linki[title_id][i]; - printf(" %s\n", title[x]); - } - #endif - - - - printf("\n\n\n\nBuilding table of distances...\n\n"); - - titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD); - titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type); - -BENCHMARK: - - titles_seen = calloc(titles_bitfield_bytes, 1); - titles_prev_round = NULL; - titles_this_round = calloc(titles_bitfield_bytes, 1); - - cur_dist = 0; - - BIT_SET(titles_seen, title_id); - BIT_SET(titles_this_round, title_id); - - while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) { - art_id i; - - cur_dist++; - printf("Checking distance %d\n", cur_dist); - - free(titles_prev_round); /* free is safe to call on NULL */ - titles_prev_round = titles_this_round; - titles_this_round = calloc(titles_bitfield_bytes, 1); - - for (i = 0; i < titles; i++) { - int j; - - if (!BIT_ISSET(titles_prev_round, i)) - continue; - - for (j = 0; j < linkis[i]; j++) { - art_id x = linki[i][j]; - BIT_SET(titles_this_round, x); - } - } - - for (i = 0; i < titles_bitfield_elems; i++) { - titles_this_round[i] &= ~titles_seen[i]; - titles_seen[i] |= titles_this_round[i]; - } - } - - /* Result is in titles_prev_round */ - - cur_dist -= 1; - - printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist); - for (i = 0; i < titles; i++) { - if (BIT_ISSET(titles_prev_round, i)) { - printf(" %s\n", title[i]); - } - } - - - free(titles_seen); - free(titles_prev_round); - free(titles_this_round); - - //goto BENCHMARK; - - return 0; -} diff --git a/src/links-outgoing-to-incoming.c b/src/links-outgoing-to-incoming.c new file mode 100644 index 0000000..2fd853a --- /dev/null +++ b/src/links-outgoing-to-incoming.c @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#include +#include + + +typedef __uint32_t art_id; + + +int main() +{ + FILE *in_file; + FILE *out_file; + + art_id titles; + + art_id **linko; + art_id *linkos; + + art_id **linki; + art_id *linkis; + + art_id link_titles_done = 0; + + art_id i; + + + /* + * Read all outgoing links into memory + */ + + in_file = fopen("links-outgoing.bin", "rb"); + + fread(&titles, sizeof(titles), 1, in_file); + + linko = malloc(titles * sizeof(art_id*)); + linkos = malloc(titles * sizeof(art_id)); + + for (i = 0; i < titles; i++) { + art_id j; + + fread(&linkos[i], sizeof(linkos[i]), 1, in_file); + + linko[i] = malloc(linkos[i] * sizeof(linko[i][0])); + + for (j = 0; j < linkos[i]; j++) { + fread(&linko[i][j], sizeof(linko[i][j]), 1, in_file); + } + } + fclose(in_file); + + printf("Outgoing links read.\n"); + + + + linki = malloc(titles * sizeof(art_id*)); + linkis = malloc(titles * sizeof(art_id)); + + for (i = 0; i < titles; i++) { + art_id j; + + for (j = 0; j < linkos[i]; j++) { + art_id x = linko[i][j]; + + linkis[x]++; + linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0])); + + linki[x][linkis[x] - 1] = i; + } + } + + printf("Links turned upside down.\n"); + + + + out_file = fopen("links-incoming.bin", "wb"); + fwrite(&titles, sizeof(titles), 1, out_file); + for (i = 0; i < titles; i++) { + art_id j; + + fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file); + + for (j = 0; j < linkis[i]; j++) { + fwrite(&linki[i][j], sizeof(linki[i][j]), 1, out_file); + } + } + fclose(out_file); + + printf("Incoming links written.\n"); +} diff --git a/src/links-plain-to-binary.c b/src/links-plain-to-binary.c new file mode 100644 index 0000000..005a496 --- /dev/null +++ b/src/links-plain-to-binary.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include +#include + + +typedef __uint32_t art_id; + + +static int +cmpstring_pp_pp(const void *p1, const void *p2) +{ + return strcasecmp(* (char * const *) p1, * (char * const *) p2); +} + +static int +cmpstring_p_pp(const void *p1, const void *p2) +{ + return strcasecmp((char * const) p1, * (char * const *) p2); +} + + + + +int main() +{ + FILE *in_file; + FILE *out_file; + + char **title = NULL; + art_id titles = 0; + art_id titles_alloc = 0; + + art_id **link; + art_id *links; + + art_id link_titles_done = 0; + + art_id i; + + + /* + * Read all titles into memory + */ + + in_file = fopen("titles.txt", "r"); + while (!feof(in_file)) { + char *in_line = NULL; + ssize_t in_line_len = 0; + size_t zero = 0; + + if (titles == titles_alloc) { + titles_alloc += 100000; + title = realloc(title, titles_alloc * sizeof(title[0])); + } + + in_line_len = getline(&in_line, &zero, in_file); + + /* Ignore empty lines and errors */ + if (in_line_len < 2) { + continue; + } + + /* Delete trailing newline */ + in_line[in_line_len - 1] = '\0'; + + title[titles] = in_line; + titles++; + } + fclose(in_file); + + qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp); + + printf("Sorting done.\n"); + + + + link = malloc(titles * sizeof(art_id*)); + links = malloc(titles * sizeof(art_id)); + + in_file = fopen("enwiki-links-plain.txt", "r"); + while (!feof(in_file)) { + char *in_line = NULL; + ssize_t in_line_len = 0; + size_t zero = 0; + + in_line_len = getline(&in_line, &zero, in_file); + + /* Ignore empty lines and errors */ + if (in_line_len < 2) { + //printf("%d\n", links[i]); + if (in_line) + free(in_line); + continue; + } + + /* Delete trailing newline */ + in_line[in_line_len - 1] = '\0'; + + if (in_line_len > 5 + && !memcmp(in_line, "~~~~", 4)) { + /* Title */ + char **cur_title = bsearch(&in_line[4], title, + titles, sizeof(title[0]), + cmpstring_p_pp); + + if (!cur_title) { + printf("TITLE NOT FOUND: %s\n", in_line); + assert(cur_title); + } + + i = cur_title - title; + //printf("%s\n", title[i]); + + link_titles_done++; + if (0 == (link_titles_done % 100000)) { + printf("%d\n", link_titles_done); + } + } else { + /* Link */ + + /* Delete trailing anchor */ + strtok(in_line, "#"); + + char **cur_link = bsearch(&in_line[0], title, + titles, sizeof(title[0]), + cmpstring_p_pp); + + if (!cur_link) { + //printf("LINK NOT FOUND: %s\n", in_line); + free(in_line); + continue; + } + + links[i]++; + link[i] = realloc(link[i], links[i] * sizeof(art_id)); + + link[i][links[i] - 1] = cur_link - title; + + //printf("%s -- %d\n", title[i], links[i]); + } + + free(in_line); + } + fclose(in_file); + + printf("Links identified.\n"); + + + + out_file = fopen("titles-sorted.txt", "w"); + for (i = 0; i < titles; i++) { + fputs(title[i], out_file); + fputs("\n", out_file); + } + fclose(out_file); + + printf("Titles written.\n"); + + + + out_file = fopen("links-outgoing.bin", "wb"); + fwrite(&titles, sizeof(titles), 1, out_file); + for (i = 0; i < titles; i++) { + art_id j; + + fwrite(&links[i], sizeof(links[i]), 1, out_file); + + for (j = 0; j < links[i]; j++) { + fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file); + } + } + fclose(out_file); + + printf("Links written.\n"); +} diff --git a/src/links-xml-to-plain.l b/src/links-xml-to-plain.l new file mode 100644 index 0000000..dabe538 --- /dev/null +++ b/src/links-xml-to-plain.l @@ -0,0 +1,10 @@ +%% +\ ; +\<\/d\> ; +\

; +\<\/p\> printf("\n"); +\ printf("~~~~"); +\<\/t\> printf("\n"); +\ ; +\<\/l\> printf("\n"); +%% \ No newline at end of file diff --git a/src/lookup-incoming.c b/src/lookup-incoming.c new file mode 100644 index 0000000..2d3fa9c --- /dev/null +++ b/src/lookup-incoming.c @@ -0,0 +1,257 @@ +#include +#include +#include +#include +#include +#include + + +#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) + + +typedef __uint32_t bitfield_type; +#define BITS_PER_BITFIELD 32 +#define BITS_PER_BITFIELD_LOG 5 + +#define BIT_SET(set, d) \ + ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) ) +#define BIT_ISSET(set, d) \ + (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) )) + +int ANY_BITS_SET(bitfield_type *set, size_t elems) +{ + size_t i; + + for (i = 0; i < elems; i++) + if (set[i] != 0) + return 1; + + return 0; +} + + +typedef __uint32_t art_id; + + + +static int +cmpstring_p_pp(const void *p1, const void *p2) +{ + return strcasecmp((char * const) p1, * (char * const *) p2); +} + + + + +int main(int argc, char **argv) +{ + FILE *in_file; + FILE *out_file; + + char *title_blob; + size_t title_blob_len; + char **title = NULL; + art_id titles; + art_id titles_read = 0; + + art_id *link_blob; + size_t link_blob_len; + art_id **linki; + art_id *linkis; + + char **cur_title; + art_id title_id; + + art_id cur_dist; + + art_id i; + + bitfield_type *titles_seen; + bitfield_type *titles_prev_round; + bitfield_type *titles_this_round; + size_t titles_bitfield_elems; + size_t titles_bitfield_bytes; + + + if (argc < 2) { + printf("Supply article name to look up articles linking to it.\n"); + return 1; + } + + + /* + * Read all incoming links into memory + */ + + in_file = fopen("links-incoming.bin", "rb"); + fseek(in_file, 0, SEEK_END); + link_blob_len = ftell(in_file); + rewind(in_file); + printf("Link blob size: %zd bytes.\n", link_blob_len); + + link_blob = malloc(link_blob_len); + if (!link_blob) { + printf("Failed to allocate memory.\n"); + return 1; + } + + if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) { + printf("Failed to read entire file in one go.\n"); + return 1; + } + printf("Link blob read (%zd bytes).\n", ftell(in_file)); + fclose(in_file); + + titles = *link_blob; + link_blob++; + + linki = malloc(titles * sizeof(art_id*)); + linkis = malloc(titles * sizeof(art_id)); + + for (i = 0; i < titles; i++) { + art_id j; + + linkis[i] = *link_blob; + link_blob++; + + linki[i] = link_blob; + link_blob += linkis[i]; + } + printf("Incoming links prepared.\n"); + + + + + + /* + * Read all titles into memory + */ + + + + in_file = fopen("titles-sorted.txt", "r"); + fseek(in_file, 0, SEEK_END); + title_blob_len = ftell(in_file); + rewind(in_file); + printf("Title blob size: %zd bytes.\n", title_blob_len); + + title_blob = malloc(title_blob_len + 1); + if (!title_blob) { + printf("Failed to allocate memory.\n"); + return 1; + } + + if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) { + printf("Failed to read entire file in one go.\n"); + return 1; + } + printf("Title blob read (%zd bytes).\n", ftell(in_file)); + fclose(in_file); + + title_blob[title_blob_len] = '\0'; + + title = malloc((titles + 1) * sizeof(title[0])); + + title[0] = title_blob; + titles_read = 1; + while (title_blob = strchr(title_blob, '\n')) { + title_blob[0] = '\0'; + title_blob++; + title[titles_read] = title_blob; + titles_read++; + } + + /* Last title will be an empty string, as the file ends with a newline. */ + printf("Titles prepared: %d of %d.\n", titles_read - 1, titles); + + + + + /* Look up article ID */ + cur_title = bsearch(argv[1], title, + titles, sizeof(title[0]), + cmpstring_p_pp); + + if (!cur_title) { + printf("TITLE NOT FOUND: %s\n", argv[1]); + return 1; + } + + title_id = cur_title - title; + + + #if 0 + /* Print first degree incoming edges */ + printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]); + for (i = 0; i < linkis[title_id]; i++) { + art_id x = linki[title_id][i]; + printf(" %s\n", title[x]); + } + #endif + + + + printf("\n\n\n\nBuilding table of distances...\n\n"); + + titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD); + titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type); + +BENCHMARK: + + titles_seen = calloc(titles_bitfield_bytes, 1); + titles_prev_round = NULL; + titles_this_round = calloc(titles_bitfield_bytes, 1); + + cur_dist = 0; + + BIT_SET(titles_seen, title_id); + BIT_SET(titles_this_round, title_id); + + while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) { + art_id i; + + cur_dist++; + printf("Checking distance %d\n", cur_dist); + + free(titles_prev_round); /* free is safe to call on NULL */ + titles_prev_round = titles_this_round; + titles_this_round = calloc(titles_bitfield_bytes, 1); + + for (i = 0; i < titles; i++) { + int j; + + if (!BIT_ISSET(titles_prev_round, i)) + continue; + + for (j = 0; j < linkis[i]; j++) { + art_id x = linki[i][j]; + BIT_SET(titles_this_round, x); + } + } + + for (i = 0; i < titles_bitfield_elems; i++) { + titles_this_round[i] &= ~titles_seen[i]; + titles_seen[i] |= titles_this_round[i]; + } + } + + /* Result is in titles_prev_round */ + + cur_dist -= 1; + + printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist); + for (i = 0; i < titles; i++) { + if (BIT_ISSET(titles_prev_round, i)) { + printf(" %s\n", title[i]); + } + } + + + free(titles_seen); + free(titles_prev_round); + free(titles_this_round); + + //goto BENCHMARK; + + return 0; +}