/*
 * Build an outgoing-link table from a list of article titles and a
 * plain-text link dump.
 *
 * Inputs (opened relative to the current directory):
 *   titles.txt              one title per line
 *   enwiki-links-plain.txt  a line starting with "~~~~" names the current
 *                           source article; every other line is a link
 *                           target of that article, optionally followed by
 *                           "#anchor"
 *
 * Outputs:
 *   titles-sorted.txt   the titles in case-insensitive order, one per line
 *   links-outgoing.bin  the number of titles as an art_id, then for each
 *                       title (in sorted order) its link count as an art_id
 *                       followed by that many art_id target indices; values
 *                       are written in host byte order
 */
#define _POSIX_C_SOURCE 200809L /* getline(), ssize_t, strcasecmp() */

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>

/* Index of a title in the sorted title array. */
typedef uint32_t art_id;

static const char TITLE_MARKER[] = "~~~~";

enum {
    TITLE_ALLOC_STEP = 100000,  /* title array grows by this many slots */
    PROGRESS_INTERVAL = 100000, /* print a progress count every N source titles */
    MARKER_LEN = sizeof(TITLE_MARKER) - 1
};

/*
 * qsort() comparator: both arguments point to elements of the title array
 * (i.e. to `char *`).  Comparison ignores case.
 */
static int cmpstring_pp_pp(const void *p1, const void *p2)
{
    const char *a = *(char *const *)p1;
    const char *b = *(char *const *)p2;

    return strcasecmp(a, b);
}

/*
 * bsearch() comparator: p1 is the key string itself, p2 points to an element
 * of the title array.  Comparison ignores case.
 */
static int cmpstring_p_pp(const void *p1, const void *p2)
{
    const char *key = p1;
    const char *elem = *(char *const *)p2;

    return strcasecmp(key, elem);
}

/* Print `what` with the current errno description and terminate. */
static void die(const char *what)
{
    perror(what);
    exit(EXIT_FAILURE);
}

/* fopen() that terminates the program with a diagnostic on failure. */
static FILE *open_or_die(const char *path, const char *mode)
{
    FILE *f = fopen(path, mode);

    if (f == NULL) {
        die(path);
    }
    return f;
}

/*
 * Read the next non-empty line from `in`, with its trailing newline removed.
 * `*buf` / `*cap` form a getline() buffer that is reused between calls; the
 * caller frees `*buf` once when done.
 * Returns the line length (always > 0), or -1 at end of file or read error.
 */
static ssize_t read_line(FILE *in, char **buf, size_t *cap)
{
    ssize_t len;

    while ((len = getline(buf, cap, in)) != -1) {
        /* Strip the newline only if present: the last line may lack one. */
        if (len > 0 && (*buf)[len - 1] == '\n') {
            (*buf)[--len] = '\0';
        }
        if (len > 0) {
            return len;
        }
    }
    return -1;
}

/*
 * Look `name` up in the sorted title array.
 * Returns 1 and stores the index in `*index` when found, 0 otherwise.
 * Because the ordering ignores case, titles differing only in case compare
 * equal and bsearch() may return any one of them.
 */
static int find_title(const char *name, char **title, art_id titles,
                      art_id *index)
{
    char **hit;

    if (titles == 0) {
        return 0;
    }
    hit = bsearch(name, title, titles, sizeof(title[0]), cmpstring_p_pp);
    if (hit == NULL) {
        return 0;
    }
    *index = (art_id)(hit - title);
    return 1;
}

/* Append `target` to the link list `*list` holding `*count` entries. */
static void append_link(art_id **list, art_id *count, art_id target)
{
    art_id *grown = realloc(*list, ((size_t)*count + 1) * sizeof(**list));

    if (grown == NULL) {
        die("realloc");
    }
    grown[*count] = target;
    *list = grown;
    (*count)++;
}

int main(void)
{
    FILE *in_file;
    FILE *out_file;
    char **title = NULL;
    art_id titles = 0;
    art_id titles_alloc = 0;
    art_id **link;             /* link[i]: target indices of title i */
    art_id *links;             /* links[i]: number of entries in link[i] */
    art_id link_titles_done = 0;
    art_id current = 0;        /* source article of the links being read */
    int have_current = 0;      /* set once the first "~~~~" line was seen */
    art_id i;
    char *line = NULL;         /* getline() buffer shared by both readers */
    size_t line_cap = 0;
    ssize_t len;

    /*
     * Read all titles into memory
     */
    in_file = open_or_die("titles.txt", "r");
    while ((len = read_line(in_file, &line, &line_cap)) != -1) {
        char *copy;

        if (titles == titles_alloc) {
            char **grown;

            if (titles_alloc > UINT32_MAX - TITLE_ALLOC_STEP) {
                fprintf(stderr, "titles.txt: too many titles\n");
                return EXIT_FAILURE;
            }
            titles_alloc += TITLE_ALLOC_STEP;
            grown = realloc(title, (size_t)titles_alloc * sizeof(title[0]));
            if (grown == NULL) {
                die("realloc");
            }
            title = grown;
        }

        copy = malloc((size_t)len + 1);
        if (copy == NULL) {
            die("malloc");
        }
        memcpy(copy, line, (size_t)len + 1);
        title[titles++] = copy;
    }
    if (ferror(in_file)) {
        die("titles.txt");
    }
    fclose(in_file);

    if (titles > 1) {
        qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
    }
    printf("Sorting done.\n");

    /*
     * Zero-initialised so every list starts empty (NULL pointer, count 0).
     * Allocate at least one element: calloc(0, ...) may return NULL.
     */
    link = calloc(titles ? titles : 1, sizeof(link[0]));
    links = calloc(titles ? titles : 1, sizeof(links[0]));
    if (link == NULL || links == NULL) {
        die("calloc");
    }

    /*
     * Resolve every link line against the title table
     */
    in_file = open_or_die("enwiki-links-plain.txt", "r");
    while ((len = read_line(in_file, &line, &line_cap)) != -1) {
        if ((size_t)len > MARKER_LEN
            && memcmp(line, TITLE_MARKER, MARKER_LEN) == 0) {
            /* Title line: switch the current source article. */
            if (!find_title(line + MARKER_LEN, title, titles, &current)) {
                printf("TITLE NOT FOUND: %s\n", line);
                return EXIT_FAILURE;
            }
            have_current = 1;

            link_titles_done++;
            if (link_titles_done % PROGRESS_INTERVAL == 0) {
                printf("%u\n", (unsigned)link_titles_done);
            }
        } else {
            /* Link line. */
            art_id target;
            char *anchor;

            if (!have_current) {
                /* No source article yet; nothing to attach the link to. */
                continue;
            }

            /* Delete trailing anchor */
            anchor = strchr(line, '#');
            if (anchor != NULL) {
                *anchor = '\0';
            }

            /* Links to unknown titles are silently dropped. */
            if (!find_title(line, title, titles, &target)) {
                continue;
            }
            append_link(&link[current], &links[current], target);
        }
    }
    if (ferror(in_file)) {
        die("enwiki-links-plain.txt");
    }
    fclose(in_file);
    free(line);
    printf("Links identified.\n");

    out_file = open_or_die("titles-sorted.txt", "w");
    for (i = 0; i < titles; i++) {
        if (fputs(title[i], out_file) == EOF
            || fputc('\n', out_file) == EOF) {
            die("titles-sorted.txt");
        }
    }
    if (fclose(out_file) != 0) {
        die("titles-sorted.txt");
    }
    printf("Titles written.\n");

    out_file = open_or_die("links-outgoing.bin", "wb");
    if (fwrite(&titles, sizeof(titles), 1, out_file) != 1) {
        die("links-outgoing.bin");
    }
    for (i = 0; i < titles; i++) {
        if (fwrite(&links[i], sizeof(links[i]), 1, out_file) != 1) {
            die("links-outgoing.bin");
        }
        if (links[i] > 0
            && fwrite(link[i], sizeof(link[i][0]), links[i], out_file)
                   != links[i]) {
            die("links-outgoing.bin");
        }
    }
    if (fclose(out_file) != 0) {
        die("links-outgoing.bin");
    }
    printf("Links written.\n");

    /* The title and link tables are left for the OS to reclaim at exit. */
    return EXIT_SUCCESS;
}