#include #include #include #include #include #include #define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) typedef __uint32_t bitfield_type; #define BITS_PER_BITFIELD 32 #define BITS_PER_BITFIELD_LOG 5 #define BIT_SET(set, d) \ ( (void) ( set[d >> BITS_PER_BITFIELD_LOG] |= (1 << (d & (BITS_PER_BITFIELD-1))) ) ) #define BIT_ISSET(set, d) \ (!(! ( set[d >> BITS_PER_BITFIELD_LOG] & (1 << (d & (BITS_PER_BITFIELD-1))) ) )) int ANY_BITS_SET(bitfield_type *set, size_t elems) { size_t i; for (i = 0; i < elems; i++) if (set[i] != 0) return 1; return 0; } typedef __uint32_t art_id; static int cmpstring_p_pp(const void *p1, const void *p2) { return strcasecmp((char * const) p1, * (char * const *) p2); } int main(int argc, char **argv) { FILE *in_file; FILE *out_file; char *title_blob; size_t title_blob_len; char **title = NULL; art_id titles; art_id titles_read = 0; art_id *link_blob; size_t link_blob_len; art_id **linki; art_id *linkis; char **cur_title; art_id title_id; art_id cur_dist; art_id i; bitfield_type *titles_seen; bitfield_type *titles_prev_round; bitfield_type *titles_this_round; size_t titles_bitfield_elems; size_t titles_bitfield_bytes; if (argc < 2) { printf("Supply article name to look up articles linking to it.\n"); return 1; } /* * Read all incoming links into memory */ in_file = fopen("links-incoming.bin", "rb"); fseek(in_file, 0, SEEK_END); link_blob_len = ftell(in_file); rewind(in_file); printf("Link blob size: %zd bytes.\n", link_blob_len); link_blob = malloc(link_blob_len); if (!link_blob) { printf("Failed to allocate memory.\n"); return 1; } if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) { printf("Failed to read entire file in one go.\n"); return 1; } printf("Link blob read (%zd bytes).\n", ftell(in_file)); fclose(in_file); titles = *link_blob; link_blob++; linki = malloc(titles * sizeof(art_id*)); linkis = malloc(titles * sizeof(art_id)); for (i = 0; i < titles; i++) { art_id j; linkis[i] = *link_blob; link_blob++; linki[i] = link_blob; link_blob += linkis[i]; } printf("Incoming links prepared.\n"); /* * Read all titles into memory */ in_file = fopen("titles-sorted.txt", "r"); fseek(in_file, 0, SEEK_END); title_blob_len = ftell(in_file); rewind(in_file); printf("Title blob size: %zd bytes.\n", title_blob_len); title_blob = malloc(title_blob_len + 1); if (!title_blob) { printf("Failed to allocate memory.\n"); return 1; } if (title_blob_len != fread(title_blob, 1, title_blob_len, in_file)) { printf("Failed to read entire file in one go.\n"); return 1; } printf("Title blob read (%zd bytes).\n", ftell(in_file)); fclose(in_file); title_blob[title_blob_len] = '\0'; title = malloc((titles + 1) * sizeof(title[0])); title[0] = title_blob; titles_read = 1; while (title_blob = strchr(title_blob, '\n')) { title_blob[0] = '\0'; title_blob++; title[titles_read] = title_blob; titles_read++; } /* Last title will be an empty string, as the file ends with a newline. */ printf("Titles prepared: %d of %d.\n", titles_read - 1, titles); /* Look up article ID */ cur_title = bsearch(argv[1], title, titles, sizeof(title[0]), cmpstring_p_pp); if (!cur_title) { printf("TITLE NOT FOUND: %s\n", argv[1]); return 1; } title_id = cur_title - title; #if 0 /* Print first degree incoming edges */ printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]); for (i = 0; i < linkis[title_id]; i++) { art_id x = linki[title_id][i]; printf(" %s\n", title[x]); } #endif printf("\n\n\n\nBuilding table of distances...\n\n"); titles_bitfield_elems = ROUNDUP(titles / BITS_PER_BITFIELD, BITS_PER_BITFIELD); titles_bitfield_bytes = titles_bitfield_elems * sizeof(bitfield_type); BENCHMARK: titles_seen = calloc(titles_bitfield_bytes, 1); titles_prev_round = NULL; titles_this_round = calloc(titles_bitfield_bytes, 1); cur_dist = 0; BIT_SET(titles_seen, title_id); BIT_SET(titles_this_round, title_id); while (ANY_BITS_SET(titles_this_round, titles_bitfield_elems)) { art_id i; cur_dist++; printf("Checking distance %d\n", cur_dist); free(titles_prev_round); /* free is safe to call on NULL */ titles_prev_round = titles_this_round; titles_this_round = calloc(titles_bitfield_bytes, 1); for (i = 0; i < titles; i++) { int j; if (!BIT_ISSET(titles_prev_round, i)) continue; for (j = 0; j < linkis[i]; j++) { art_id x = linki[i][j]; BIT_SET(titles_this_round, x); } } for (i = 0; i < titles_bitfield_elems; i++) { titles_this_round[i] &= ~titles_seen[i]; titles_seen[i] |= titles_this_round[i]; } } /* Result is in titles_prev_round */ cur_dist -= 1; printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist); for (i = 0; i < titles; i++) { if (BIT_ISSET(titles_prev_round, i)) { printf(" %s\n", title[i]); } } free(titles_seen); free(titles_prev_round); free(titles_this_round); //goto BENCHMARK; return 0; }