diff options
author | norly <ny-git@enpas.org> | 2019-07-14 17:28:47 +0200 |
---|---|---|
committer | norly <ny-git@enpas.org> | 2019-07-14 17:28:47 +0200 |
commit | f0f54296b5b445c6ce0e47486bcdcb0deca582ff (patch) | |
tree | 35c3858bab40f4bf8f6c57e2d5522f17d2928511 /src/lookup-incoming.c | |
parent | 64907e38005ada5b2a545ae58f05d0fd616ffa79 (diff) |
Move to Makefile and .gitignore
Diffstat (limited to 'src/lookup-incoming.c')
-rw-r--r-- | src/lookup-incoming.c | 257 |
1 files changed, 257 insertions, 0 deletions
diff --git a/src/lookup-incoming.c b/src/lookup-incoming.c new file mode 100644 index 0000000..2d3fa9c --- /dev/null +++ b/src/lookup-incoming.c @@ -0,0 +1,257 @@ +#include <assert.h> +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> + + +#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) + + +typedef __uint32_t bitfield_type; +#define BITS_PER_BITFIELD 32 +#define BITS_PER_BITFIELD_LOG 5 + +#define BIT_SET(set, d) \ + ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) ) +#define BIT_ISSET(set, d) \ + (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) )) + +int ANY_BITS_SET(bitfield_type *set, size_t elems) +{ + size_t i; + + for (i = 0; i < elems; i++) + if (set[i] != 0) + return 1; + + return 0; +} + + +typedef __uint32_t art_id; + + + +static int +cmpstring_p_pp(const void *p1, const void *p2) +{ + return strcasecmp((char * const) p1, * (char * const *) p2); +} + + + + +int main(int argc, char **argv) +{ + FILE *in_file; + FILE *out_file; + + char *title_blob; + size_t title_blob_len; + char **title = NULL; + art_id titles; + art_id titles_read = 0; + + art_id *link_blob; + size_t link_blob_len; + art_id **linki; + art_id *linkis; + + char **cur_title; + art_id title_id; + + art_id cur_dist; + + art_id i; + + bitfield_type *titles_seen; + bitfield_type *titles_prev_round; + bitfield_type *titles_this_round; + size_t titles_bitfield_elems; + size_t titles_bitfield_bytes; + + + if (argc < 2) { + printf("Supply article name to look up articles linking to it.\n"); + return 1; + } + + + /* + * Read all incoming links into memory + */ + + in_file = fopen("links-incoming.bin", "rb"); + fseek(in_file, 0, SEEK_END); + link_blob_len = ftell(in_file); + rewind(in_file); + printf("Link blob size: %zd bytes.\n", link_blob_len); + + link_blob = malloc(link_blob_len); + if (!link_blob) { + printf("Failed to allocate memory.\n"); + return 1; + } + + if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) { + printf("Failed to read entire file in one go.\n"); + return 1; + } + printf("Link blob read (%zd bytes).\n", ftell(in_file)); + fclose(in_file); + + titles = *link_blob; + link_blob++; + + linki = malloc(titles * sizeof(art_id*)); + linkis = malloc(titles * sizeof(art_id)); + + for (i = 0; i < titles; i++) { + art_id j; + + linkis[i] = *link_blob; + link_blob++; + + linki[i] = link_blob; + link_blob += linkis[i]; + } + printf("Incoming links prepared.\n"); + + + + + + /* + * Read all titles into memory + */ + + + + in_file = fopen("titles-sorted.txt", "r"); + fseek(in_file, 0, SEEK_END); + title_blob_len = ftell(in_file); + rewind(in_file); + printf("Title blob size: %zd bytes.\n", title_blob_len); + + title_blob = malloc(title_blob_len + 1); + if (!title_blob) { + printf("Failed to allocate memory.\n"); + return 1; + } + + if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) { + printf("Failed to read entire file in one go.\n"); + return 1; + } + printf("Title blob read (%zd bytes).\n", ftell(in_file)); + fclose(in_file); + + title_blob[title_blob_len] = '\0'; + + title = malloc((titles + 1) * sizeof(title[0])); + + title[0] = title_blob; + titles_read = 1; + while (title_blob = strchr(title_blob, '\n')) { + title_blob[0] = '\0'; + title_blob++; + title[titles_read] = title_blob; + titles_read++; + } + + /* Last title will be an empty string, as the file ends with a newline. */ + printf("Titles prepared: %d of %d.\n", titles_read - 1, titles); + + + + + /* Look up article ID */ + cur_title = bsearch(argv[1], title, + titles, sizeof(title[0]), + cmpstring_p_pp); + + if (!cur_title) { + printf("TITLE NOT FOUND: %s\n", argv[1]); + return 1; + } + + title_id = cur_title - title; + + + #if 0 + /* Print first degree incoming edges */ + printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]); + for (i = 0; i < linkis[title_id]; i++) { + art_id x = linki[title_id][i]; + printf(" %s\n", title[x]); + } + #endif + + + + printf("\n\n\n\nBuilding table of distances...\n\n"); + + titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD); + titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type); + +BENCHMARK: + + titles_seen = calloc(titles_bitfield_bytes, 1); + titles_prev_round = NULL; + titles_this_round = calloc(titles_bitfield_bytes, 1); + + cur_dist = 0; + + BIT_SET(titles_seen, title_id); + BIT_SET(titles_this_round, title_id); + + while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) { + art_id i; + + cur_dist++; + printf("Checking distance %d\n", cur_dist); + + free(titles_prev_round); /* free is safe to call on NULL */ + titles_prev_round = titles_this_round; + titles_this_round = calloc(titles_bitfield_bytes, 1); + + for (i = 0; i < titles; i++) { + int j; + + if (!BIT_ISSET(titles_prev_round, i)) + continue; + + for (j = 0; j < linkis[i]; j++) { + art_id x = linki[i][j]; + BIT_SET(titles_this_round, x); + } + } + + for (i = 0; i < titles_bitfield_elems; i++) { + titles_this_round[i] &= ~titles_seen[i]; + titles_seen[i] |= titles_this_round[i]; + } + } + + /* Result is in titles_prev_round */ + + cur_dist -= 1; + + printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist); + for (i = 0; i < titles; i++) { + if (BIT_ISSET(titles_prev_round, i)) { + printf(" %s\n", title[i]); + } + } + + + free(titles_seen); + free(titles_prev_round); + free(titles_this_round); + + //goto BENCHMARK; + + return 0; +} |