/*
 * art_id: unsigned 32-bit integer type used in this program for article
 * counts and article indices (array capacity, progress counter, and the
 * per-article link tables are all sized with it).
 *
 * NOTE(review): __uint32_t is an implementation-internal name (identifiers
 * with a double underscore are reserved). The portable spelling is uint32_t
 * from <stdint.h>; confirm that header is included before switching.
 */
9 typedef __uint32_t art_id;
/*
 * Comparator taking two pointers to char* elements (pointer-to-pointer on
 * both sides, hence the _pp_pp suffix). Each argument is dereferenced once
 * and the resulting strings are compared case-insensitively with
 * strcasecmp. This is the shape qsort expects when sorting an array of
 * char*, and it is the comparator passed to qsort for the title array.
 *
 * Returns the strcasecmp result: negative, zero or positive.
 */
13 cmpstring_pp_pp(const void *p1, const void *p2)
15 return strcasecmp(* (char * const *) p1, * (char * const *) p2);
/*
 * Comparator taking a plain string as p1 and a pointer to a char* element
 * as p2 (hence the _p_pp suffix). p1 is used directly as the string, p2 is
 * dereferenced once, and the two strings are compared case-insensitively
 * with strcasecmp. This is the shape bsearch needs when the key is passed
 * as a char* and the array holds char* elements.
 * NOTE(review): presumably this is the comparator handed to the bsearch
 * calls in this file; the argument line of those calls is not shown here.
 *
 * Returns the strcasecmp result: negative, zero or positive.
 *
 * NOTE(review): the cast (char * const) makes the pointer itself const but
 * drops the const on the pointed-to characters that const void * carried.
 * (const char *) p1 would be const-correct and is all strcasecmp requires.
 */
19 cmpstring_p_pp(const void *p1, const void *p2)
21 return strcasecmp((char * const) p1, * (char * const *) p2);
/*
 * Program body. Visible phases, in order:
 *   1. read titles.txt into the title array and sort it case-insensitively;
 *   2. read enwiki-links-plain.txt and build, per title, an array of the
 *      indices of the titles it links to;
 *   3. write the sorted titles to titles-sorted.txt;
 *   4. write the link tables to links-outgoing.bin.
 * Several declarations, closing braces and counter increments are not
 * shown here; notes that depend on them are marked NOTE(review).
 */
/* Number of slots currently allocated in the title array. */
34 art_id titles_alloc = 0;
/* Progress counter; its value is printed whenever it is a multiple of 100000. */
39 art_id link_titles_done = 0;
45 * Read all titles into memory
48 in_file = fopen("titles.txt", "r");
/*
 * NOTE(review): in_file is used by the loop directly after fopen with no
 * NULL check, so a missing titles.txt leads to feof/getline on NULL.
 *
 * NOTE(review): feof only reports end-of-file after a read has failed, so
 * the last iteration always runs with a failing getline. That case is
 * caught by the length check below (getline returns -1), which is what
 * keeps this loop form working.
 */
49 while (!feof(in_file)) {
51 ssize_t in_line_len = 0;
/* Array full: grow the title array by another 100000 slots. */
54 if (titles == titles_alloc) {
55 titles_alloc += 100000;
/*
 * NOTE(review): the realloc result overwrites title directly; on failure
 * the old block would be lost and title becomes NULL. No check is visible.
 */
56 title = realloc(title, titles_alloc * sizeof(title[0]));
/*
 * NOTE(review): the buffer is later stored in title[titles], so each
 * iteration needs its own buffer. Presumably in_line is NULL and zero is 0
 * at this point so that getline allocates a fresh one; their declarations
 * are not shown here, confirm.
 */
59 in_line_len = getline(&in_line, &zero, in_file);
61 /* Ignore empty lines and errors */
/* Lengths below 2 cover the -1 error/EOF result and a line holding only a newline. */
62 if (in_line_len < 2) {
66 /* Delete trailing newline */
/*
 * NOTE(review): the last byte is overwritten unconditionally. If the final
 * line of the file has no trailing newline, its last real character is
 * removed instead.
 */
67 in_line[in_line_len - 1] = '\0';
/* The title array takes over the getline buffer. */
69 title[titles] = in_line;
/* Case-insensitive sort, so the later bsearch lookups can use the same ordering. */
74 qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
76 printf("Sorting done.\n");
/*
 * Per-title tables: link[i] is the array of link targets of title i and
 * links[i] is the number of entries in it.
 *
 * NOTE(review): malloc does not zero memory, but link[i] is later passed
 * to realloc and links[i] is used as a count, so both tables must start
 * out as NULL / 0. Unless that happens on a line not shown here, calloc
 * (or an explicit clear) is required to avoid undefined behavior. The
 * malloc results are also used without a NULL check.
 */
80 link = malloc(titles * sizeof(art_id*));
81 links = malloc(titles * sizeof(art_id));
83 in_file = fopen("enwiki-links-plain.txt", "r");
/* NOTE(review): same unchecked fopen and feof-driven loop as for titles.txt above. */
84 while (!feof(in_file)) {
86 ssize_t in_line_len = 0;
89 in_line_len = getline(&in_line, &zero, in_file);
91 /* Ignore empty lines and errors */
92 if (in_line_len < 2) {
93 //printf("%d\n", links[i]);
99 /* Delete trailing newline */
100 in_line[in_line_len - 1] = '\0';
/*
 * Lines whose first four bytes are ~~~~ name the article that the
 * following link lines belong to (the first half of this condition is not
 * shown here).
 */
103 && !memcmp(in_line, "~~~~", 4)) {
/* Look up the text after the four-byte marker in the sorted title array. */
105 char **cur_title = bsearch(&in_line[4], title,
106 titles, sizeof(title[0]),
/* NOTE(review): presumably reached when the lookup returned NULL; the guarding condition is not shown here. */
110 printf("TITLE NOT FOUND: %s\n", in_line);
/* Index of the current source article within the title array. */
114 i = cur_title - title;
115 //printf("%s\n", title[i]);
/* Progress output every 100000 processed source titles. */
118 if (0 == (link_titles_done % 100000)) {
/* NOTE(review): link_titles_done is an unsigned art_id; %d expects int. %u (or PRIu32) matches the type. */
119 printf("%d\n", link_titles_done);
124 /* Delete trailing anchor */
/*
 * strtok writes a terminator over the first # that follows the leading
 * token, cutting the anchor off in place; the return value is not needed.
 * NOTE(review): strtok skips leading delimiters, so a line that starts
 * with # is left unchanged rather than becoming empty.
 */
125 strtok(in_line, "#");
/* Look up the link target (whole line, anchor removed) in the sorted title array. */
127 char **cur_link = bsearch(&in_line[0], title,
128 titles, sizeof(title[0]),
132 //printf("LINK NOT FOUND: %s\n", in_line);
/*
 * Resize the link array of article i to links[i] entries and store the
 * index of the target title in the last slot.
 * NOTE(review): the increment of links[i] is not shown here; presumably it
 * happens just before this line. The realloc result is assigned back
 * directly and no NULL check is visible.
 */
138 link[i] = realloc(link[i], links[i] * sizeof(art_id));
140 link[i][links[i] - 1] = cur_link - title;
142 //printf("%s -- %d\n", title[i], links[i]);
149 printf("Links identified.\n");
/*
 * Write the sorted titles as text, one title per line.
 * NOTE(review): out_file is used by the loop directly after fopen with no
 * NULL check, and the fputs results are ignored.
 */
153 out_file = fopen("titles-sorted.txt", "w");
154 for (i = 0; i < titles; i++) {
155 fputs(title[i], out_file);
156 fputs("\n", out_file);
160 printf("Titles written.\n");
/*
 * Binary output layout as written below: the title count first, then for
 * every title its link count followed by that many link target indices.
 * Each value is written as its raw in-memory representation, so field
 * width and byte order are those of the machine producing the file.
 * NOTE(review): none of the fwrite results are checked in the lines shown.
 */
164 out_file = fopen("links-outgoing.bin", "wb");
165 fwrite(&titles, sizeof(titles), 1, out_file);
166 for (i = 0; i < titles; i++) {
169 fwrite(&links[i], sizeof(links[i]), 1, out_file);
171 for (j = 0; j < links[i]; j++) {
172 fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
177 printf("Links written.\n");