005a49604b3b82cd416df8548f7a0bd4e031b9f2
[enwiki-links-graph.git] / links-plain-to-binary.c
#include <assert.h>
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
7
8
/*
 * Article identifier: the index of a title in the sorted title array.
 * Values of this type are written verbatim to the binary output file, so
 * the type is an exact-width 32-bit unsigned integer. The standard
 * uint32_t is used instead of the implementation-reserved __uint32_t.
 */
typedef uint32_t art_id;
10
11
/*
 * qsort() comparator for an array of C strings.
 *
 * Both arguments point at array elements (i.e. at char pointers); the
 * strings they refer to are compared case-insensitively with strcasecmp().
 */
static int
cmpstring_pp_pp(const void *p1, const void *p2)
{
        const char *const *lhs = p1;
        const char *const *rhs = p2;

        return strcasecmp(*lhs, *rhs);
}
17
/*
 * bsearch() comparator for looking up a string in an array of C strings.
 *
 * p1 is the search key itself (a NUL-terminated string), p2 points at an
 * array element (a char pointer). The comparison is case-insensitive.
 *
 * The arguments are converted through const-qualified locals so that the
 * const on the pointed-to data is not cast away.
 */
static int
cmpstring_p_pp(const void *p1, const void *p2)
{
        const char *key = p1;
        const char *const *elem = p2;

        return strcasecmp(key, *elem);
}
23
24
25
26
27 int main()
28 {
29         FILE *in_file;
30         FILE *out_file;
31
32         char **title = NULL;
33         art_id titles = 0;
34         art_id titles_alloc = 0;
35
36         art_id **link;
37         art_id *links;
38
39         art_id link_titles_done = 0;
40
41         art_id i;
42
43
44         /*
45          * Read all titles into memory
46          */
47
48         in_file = fopen("titles.txt", "r");
49         while (!feof(in_file)) {
50                 char *in_line = NULL;
51                 ssize_t in_line_len = 0;
52                 size_t zero = 0;
53
54                 if (titles == titles_alloc) {
55                         titles_alloc += 100000;
56                         title = realloc(title, titles_alloc * sizeof(title[0]));
57                 }
58
59                 in_line_len = getline(&in_line, &zero, in_file);
60
61                 /* Ignore empty lines and errors */
62                 if (in_line_len < 2) {
63                         continue;
64                 }
65
66                 /* Delete trailing newline */
67                 in_line[in_line_len - 1] = '\0';
68
69                 title[titles] = in_line;
70                 titles++;
71         }
72         fclose(in_file);
73
74         qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
75
76         printf("Sorting done.\n");
77
78
79
80         link = malloc(titles * sizeof(art_id*));
81         links = malloc(titles * sizeof(art_id));
82
83         in_file = fopen("enwiki-links-plain.txt", "r");
84         while (!feof(in_file)) {
85                 char *in_line = NULL;
86                 ssize_t in_line_len = 0;
87                 size_t zero = 0;
88
89                 in_line_len = getline(&in_line, &zero, in_file);
90
91                 /* Ignore empty lines and errors */
92                 if (in_line_len < 2) {
93                         //printf("%d\n", links[i]);
94                         if (in_line)
95                                 free(in_line);
96                         continue;
97                 }
98
99                 /* Delete trailing newline */
100                 in_line[in_line_len - 1] = '\0';
101
102                 if (in_line_len > 5
103                     && !memcmp(in_line, "~~~~", 4)) {
104                         /* Title */
105                         char **cur_title = bsearch(&in_line[4], title,
106                                                 titles, sizeof(title[0]),
107                                                 cmpstring_p_pp);
108
109                         if (!cur_title) {
110                                 printf("TITLE NOT FOUND: %s\n", in_line);
111                                 assert(cur_title);
112                         }
113
114                         i = cur_title - title;
115                         //printf("%s\n", title[i]);
116
117                         link_titles_done++;
118                         if (0 == (link_titles_done % 100000)) {
119                                 printf("%d\n", link_titles_done);
120                         }
121                 } else {
122                         /* Link */
123
124                         /* Delete trailing anchor */
125                         strtok(in_line, "#");
126
127                         char **cur_link = bsearch(&in_line[0], title,
128                                                 titles, sizeof(title[0]),
129                                                 cmpstring_p_pp);
130
131                         if (!cur_link) {
132                                 //printf("LINK NOT FOUND: %s\n", in_line);
133                                 free(in_line);
134                                 continue;
135                         }
136
137                         links[i]++;
138                         link[i] = realloc(link[i], links[i] * sizeof(art_id));
139
140                         link[i][links[i] - 1] = cur_link - title;
141
142                         //printf("%s -- %d\n", title[i], links[i]);
143                 }
144
145                 free(in_line);
146         }
147         fclose(in_file);
148
149         printf("Links identified.\n");
150
151
152
153         out_file = fopen("titles-sorted.txt", "w");
154         for (i = 0; i < titles; i++) {
155                 fputs(title[i], out_file);
156                 fputs("\n", out_file);
157         }
158         fclose(out_file);
159
160         printf("Titles written.\n");
161
162
163
164         out_file = fopen("links-outgoing.bin", "wb");
165         fwrite(&titles, sizeof(titles), 1, out_file);
166         for (i = 0; i < titles; i++) {
167                 art_id j;
168
169                 fwrite(&links[i], sizeof(links[i]), 1, out_file);
170
171                 for (j = 0; j < links[i]; j++) {
172                         fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
173                 }
174         }
175         fclose(out_file);
176
177         printf("Links written.\n");
178 }