13ded29aa9d6f9f9f113d5e26011b11668e20c9e
[enwiki-links-graph.git] / links-plain-to-binary.c
1 #include <assert.h>
2 #include <ctype.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6
7
/*
 * Comparator for qsort() over an array of C strings.
 *
 * Both arguments point at array elements (each element is a char *); the
 * strings they refer to are ordered with strcasecmp(), i.e. ignoring case.
 */
static int
cmpstring_pp_pp(const void *p1, const void *p2)
{
        const char *const *lhs = p1;
        const char *const *rhs = p2;

        return strcasecmp(*lhs, *rhs);
}
13
/*
 * Comparator for bsearch() over an array of C strings.
 *
 * p1 is the search key itself (a char *), while p2 points at an array
 * element (a char *), so only p2 needs dereferencing.  The comparison is
 * done with strcasecmp(), i.e. ignoring case.
 */
static int
cmpstring_p_pp(const void *p1, const void *p2)
{
        const char *key = p1;
        const char *const *element = p2;

        return strcasecmp(key, *element);
}
19
20
21
22
23 int main()
24 {
25         FILE *in_file;
26         FILE *out_file;
27
28         char **title = NULL;
29         size_t titles = 0;
30         size_t titles_alloc = 0;
31
32         size_t **link;
33         size_t *links;
34
35         size_t link_titles_done = 0;
36
37         size_t i;
38
39
40         /*
41          * Read all titles into memory
42          */
43
44         in_file = fopen("titles.txt", "r");
45         while (!feof(in_file)) {
46                 char *in_line = NULL;
47                 ssize_t in_line_len = 0;
48                 size_t zero = 0;
49
50                 if (titles == titles_alloc) {
51                         titles_alloc += 100000;
52                         title = realloc(title, titles_alloc * sizeof(title[0]));
53                 }
54
55                 in_line_len = getline(&in_line, &zero, in_file);
56
57                 /* Ignore empty lines and errors */
58                 if (in_line_len < 2) {
59                         continue;
60                 }
61
62                 /* Delete trailing newline */
63                 in_line[in_line_len - 1] = '\0';
64
65                 title[titles] = in_line;
66                 titles++;
67         }
68         fclose(in_file);
69
70         qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
71
72         printf("Sorting done.\n");
73
74
75
76         link = malloc(titles * sizeof(size_t*));
77         links = malloc(titles * sizeof(size_t));
78
79         in_file = fopen("enwiki-links-plain.txt", "r");
80         while (!feof(in_file)) {
81                 char *in_line = NULL;
82                 ssize_t in_line_len = 0;
83                 size_t zero = 0;
84
85                 in_line_len = getline(&in_line, &zero, in_file);
86
87                 /* Ignore empty lines and errors */
88                 if (in_line_len < 2) {
89                         //printf("%d\n", links[i]);
90                         if (in_line)
91                                 free(in_line);
92                         continue;
93                 }
94
95                 /* Delete trailing newline */
96                 in_line[in_line_len - 1] = '\0';
97
98                 if (in_line_len > 5
99                     && !memcmp(in_line, "~~~~", 4)) {
100                         /* Title */
101                         char **cur_title = bsearch(&in_line[4], title,
102                                                 titles, sizeof(title[0]),
103                                                 cmpstring_p_pp);
104
105                         if (!cur_title) {
106                                 printf("TITLE NOT FOUND: %s\n", in_line);
107                                 assert(cur_title);
108                         }
109
110                         i = cur_title - title;
111                         //printf("%s\n", title[i]);
112
113                         link_titles_done++;
114                         if (0 == (link_titles_done % 100000)) {
115                                 printf("%d\n", link_titles_done);
116                         }
117                 } else {
118                         /* Link */
119
120                         /* Delete trailing anchor */
121                         strtok(in_line, "#");
122
123                         char **cur_link = bsearch(&in_line[0], title,
124                                                 titles, sizeof(title[0]),
125                                                 cmpstring_p_pp);
126
127                         if (!cur_link) {
128                                 //printf("LINK NOT FOUND: %s\n", in_line);
129                                 free(in_line);
130                                 continue;
131                         }
132
133                         links[i]++;
134                         link[i] = realloc(link[i], links[i] * sizeof(size_t));
135
136                         link[i][links[i] - 1] = cur_link - title;
137
138                         //printf("%s -- %d\n", title[i], links[i]);
139                 }
140
141                 free(in_line);
142         }
143         fclose(in_file);
144
145         printf("Links identified.\n");
146
147
148
149         out_file = fopen("titles-sorted.txt", "w");
150         for (i = 0; i < titles; i++) {
151                 fputs(title[i], out_file);
152                 fputs("\n", out_file);
153         }
154         fclose(out_file);
155
156         printf("Titles written.\n");
157
158
159
160         out_file = fopen("links-outgoing.bin", "wb");
161         fwrite(&titles, sizeof(titles), 1, out_file);
162         for (i = 0; i < titles; i++) {
163                 size_t j;
164
165                 fwrite(&links[i], sizeof(links[i]), 1, out_file);
166
167                 for (j = 0; j < links[i]; j++) {
168                         fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
169                 }
170         }
171         fclose(out_file);
172
173         printf("Links written.\n");
174 }