Move to Makefile and .gitignore
[enwiki-links-graph.git] / src / lookup-incoming.c
diff --git a/src/lookup-incoming.c b/src/lookup-incoming.c
new file mode 100644 (file)
index 0000000..2d3fa9c
--- /dev/null
@@ -0,0 +1,257 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y))
+
+
+typedef __uint32_t bitfield_type;
+#define BITS_PER_BITFIELD 32
+#define BITS_PER_BITFIELD_LOG 5
+
+#define BIT_SET(set, d) \
+  ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) )
+#define BIT_ISSET(set, d) \
+  (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) ))
+
+int ANY_BITS_SET(bitfield_type *set, size_t elems)
+{
+       size_t i;
+
+       for (i = 0; i < elems; i++)
+               if (set[i] != 0)
+                       return 1;
+
+       return 0;
+}
+
+
+typedef __uint32_t art_id;
+
+
+
+static int
+cmpstring_p_pp(const void *p1, const void *p2)
+{
+       return strcasecmp((char * const) p1, * (char * const *) p2);
+}
+
+
+
+
+int main(int argc, char **argv)
+{
+       FILE *in_file;
+       FILE *out_file;
+
+       char *title_blob;
+       size_t title_blob_len;
+       char **title = NULL;
+       art_id titles;
+       art_id titles_read = 0;
+
+       art_id *link_blob;
+       size_t link_blob_len;
+       art_id **linki;
+       art_id *linkis;
+
+       char **cur_title;
+       art_id title_id;
+
+       art_id cur_dist;
+
+       art_id i;
+
+       bitfield_type *titles_seen;
+       bitfield_type *titles_prev_round;
+       bitfield_type *titles_this_round;
+       size_t titles_bitfield_elems;
+       size_t titles_bitfield_bytes;
+
+
+       if (argc < 2) {
+               printf("Supply article name to look up articles linking to it.\n");
+               return 1;
+       }
+
+
+       /*
+        * Read all incoming links into memory
+        */
+
+       in_file = fopen("links-incoming.bin", "rb");
+       fseek(in_file, 0, SEEK_END);
+       link_blob_len = ftell(in_file);
+       rewind(in_file);
+       printf("Link blob size: %zd bytes.\n", link_blob_len);
+
+       link_blob = malloc(link_blob_len);
+       if (!link_blob) {
+               printf("Failed to allocate memory.\n");
+               return 1;
+       }
+
+       if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) {
+               printf("Failed to read entire file in one go.\n");
+               return 1;
+       }
+       printf("Link blob read (%zd bytes).\n", ftell(in_file));
+       fclose(in_file);
+
+       titles = *link_blob;
+       link_blob++;
+
+       linki = malloc(titles * sizeof(art_id*));
+       linkis = malloc(titles * sizeof(art_id));
+
+       for (i = 0; i < titles; i++) {
+               art_id j;
+
+               linkis[i] = *link_blob;
+               link_blob++;
+
+               linki[i] = link_blob;
+               link_blob += linkis[i];
+       }
+       printf("Incoming links prepared.\n");
+
+
+
+
+
+       /*
+        * Read all titles into memory
+        */
+
+
+
+       in_file = fopen("titles-sorted.txt", "r");
+       fseek(in_file, 0, SEEK_END);
+       title_blob_len = ftell(in_file);
+       rewind(in_file);
+       printf("Title blob size: %zd bytes.\n", title_blob_len);
+
+       title_blob = malloc(title_blob_len + 1);
+       if (!title_blob) {
+               printf("Failed to allocate memory.\n");
+               return 1;
+       }
+
+       if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) {
+               printf("Failed to read entire file in one go.\n");
+               return 1;
+       }
+       printf("Title blob read (%zd bytes).\n", ftell(in_file));
+       fclose(in_file);
+
+       title_blob[title_blob_len] = '\0';
+
+       title = malloc((titles + 1) * sizeof(title[0]));
+
+       title[0] = title_blob;
+       titles_read = 1;
+       while (title_blob = strchr(title_blob, '\n')) {
+               title_blob[0] = '\0';
+               title_blob++;
+               title[titles_read] = title_blob;
+               titles_read++;
+       }
+
+       /* Last title will be an empty string, as the file ends with a newline. */
+       printf("Titles prepared: %d of %d.\n", titles_read - 1, titles);
+
+
+
+
+       /* Look up article ID */
+       cur_title = bsearch(argv[1], title,
+                               titles, sizeof(title[0]),
+                               cmpstring_p_pp);
+
+       if (!cur_title) {
+               printf("TITLE NOT FOUND: %s\n", argv[1]);
+               return 1;
+       }
+
+       title_id = cur_title - title;
+
+
+       #if 0
+       /* Print first degree incoming edges */
+       printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
+       for (i = 0; i < linkis[title_id]; i++) {
+               art_id x = linki[title_id][i];
+               printf("  %s\n", title[x]);
+       }
+       #endif
+
+
+
+       printf("\n\n\n\nBuilding table of distances...\n\n");
+
+       titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD);
+       titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type);
+
+BENCHMARK:
+
+       titles_seen = calloc(titles_bitfield_bytes, 1);
+       titles_prev_round = NULL;
+       titles_this_round = calloc(titles_bitfield_bytes, 1);
+
+       cur_dist = 0;
+
+       BIT_SET(titles_seen, title_id);
+       BIT_SET(titles_this_round, title_id);
+
+       while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) {
+               art_id i;
+
+               cur_dist++;
+               printf("Checking distance %d\n", cur_dist);
+
+               free(titles_prev_round); /* free is safe to call on NULL */
+               titles_prev_round = titles_this_round;
+               titles_this_round = calloc(titles_bitfield_bytes, 1);
+
+               for (i = 0; i < titles; i++) {
+                       int j;
+
+                       if (!BIT_ISSET(titles_prev_round, i))
+                               continue;
+
+                       for (j = 0; j < linkis[i]; j++) {
+                               art_id x = linki[i][j];
+                               BIT_SET(titles_this_round, x);
+                       }
+               }
+
+               for (i = 0; i < titles_bitfield_elems; i++) {
+                       titles_this_round[i] &= ~titles_seen[i];
+                       titles_seen[i] |= titles_this_round[i];
+               }
+       }
+
+       /* Result is in titles_prev_round */
+
+       cur_dist -= 1;
+
+       printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
+       for (i = 0; i < titles; i++) {
+               if (BIT_ISSET(titles_prev_round, i)) {
+                       printf("  %s\n", title[i]);
+               }
+       }
+
+
+       free(titles_seen);
+       free(titles_prev_round);
+       free(titles_this_round);
+
+       //goto BENCHMARK;
+
+       return 0;
+}