lookup: Accelerate reading link database
[enwiki-links-graph.git] / lookup-incoming.c
1 #include <assert.h>
2 #include <ctype.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <sys/types.h>
7
8
9 #define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y))
10
11 #define BIT_SET(set, d) \
12   ( (void) ( set[d >> 3] |= (1 << (d & 7)) ) )
13 #define BIT_ISSET(set, d) \
14   (!(! ( set[d >> 3] & (1 << (d & 7)) ) ))
15
16 int ANY_BITS_SET(unsigned char *set, size_t bytes)
17 {
18         size_t i;
19
20         for (i = 0; i < bytes; i++)
21                 if (set[i] != 0)
22                         return 1;
23
24         return 0;
25 }
26
27
28 typedef __uint32_t art_id;
29
30
31
32 static int
33 cmpstring_p_pp(const void *p1, const void *p2)
34 {
35         return strcasecmp((char * const) p1, * (char * const *) p2);
36 }
37
38
39
40
41 int main(int argc, char **argv)
42 {
43         FILE *in_file;
44         FILE *out_file;
45
46         char **title = NULL;
47         art_id titles;
48         art_id titles_read = 0;
49
50         art_id *link_blob;
51         size_t link_blob_len;
52         art_id **linki;
53         art_id *linkis;
54
55         char **cur_title;
56         art_id title_id;
57
58         art_id cur_dist;
59
60         art_id i;
61
62         unsigned char *titles_seen;
63         unsigned char *titles_prev_round;
64         unsigned char *titles_this_round;
65         size_t titles_bitfield_len;
66
67
68         if (argc < 2) {
69                 printf("Supply article name to look up articles linking to it.\n");
70                 return 1;
71         }
72
73
74         /*
75          * Read all incoming links into memory
76          */
77
78         in_file = fopen("links-incoming.bin", "rb");
79         fseek(in_file, 0, SEEK_END);
80         link_blob_len = ftell(in_file);
81         rewind(in_file);
82         printf("Link blob size: %zd bytes.\n", link_blob_len);
83
84         link_blob = malloc(link_blob_len);
85         if (!link_blob) {
86                 printf("Failed to allocate memory.\n");
87                 return 1;
88         }
89
90         if (link_blob_len != fread(link_blob, 1, link_blob_len, in_file)) {
91                 printf("Failed to read entire file in one go.\n");
92                 return 1;
93         }
94         printf("Link blob read (%zd bytes).\n", ftell(in_file));
95         fclose(in_file);
96
97         titles = *link_blob;
98         link_blob++;
99
100         linki = malloc(titles * sizeof(art_id*));
101         linkis = malloc(titles * sizeof(art_id));
102
103         for (i = 0; i < titles; i++) {
104                 art_id j;
105
106                 linkis[i] = *link_blob;
107                 link_blob++;
108
109                 linki[i] = link_blob;
110                 link_blob += linkis[i];
111         }
112         printf("Incoming links prepared.\n");
113
114
115
116
117
118         /*
119          * Read all titles into memory
120          */
121
122         title = malloc(titles * sizeof(title[0]));
123
124         in_file = fopen("titles-sorted.txt", "r");
125         while (!feof(in_file)) {
126                 char *in_line = NULL;
127                 ssize_t in_line_len = 0;
128                 size_t zero = 0;
129
130                 in_line_len = getline(&in_line, &zero, in_file);
131
132                 /* Ignore empty lines and errors */
133                 if (in_line_len < 2) {
134                         free(in_line);
135                         continue;
136                 }
137
138                 /* Delete trailing newline */
139                 in_line[in_line_len - 1] = '\0';
140
141                 title[titles_read] = in_line;
142                 titles_read++;
143         }
144         fclose(in_file);
145
146         printf("Titles read.\n");
147
148
149
150
151         /* Look up article ID */
152         cur_title = bsearch(argv[1], title,
153                                 titles, sizeof(title[0]),
154                                 cmpstring_p_pp);
155
156         if (!cur_title) {
157                 printf("TITLE NOT FOUND: %s\n", argv[1]);
158                 return 1;
159         }
160
161         title_id = cur_title - title;
162
163
164         #if 0
165         /* Print first degree incoming edges */
166         printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
167         for (i = 0; i < linkis[title_id]; i++) {
168                 art_id x = linki[title_id][i];
169                 printf("  %s\n", title[x]);
170         }
171         #endif
172
173
174
175         printf("\n\n\n\nBuilding table of distances...\n\n");
176
177         titles_bitfield_len = ROUNDUP(titles / 8, sizeof(size_t));
178
179         titles_seen = calloc(titles_bitfield_len, 1);
180         titles_prev_round = NULL;
181         titles_this_round = calloc(titles_bitfield_len, 1);
182
183         cur_dist = 0;
184
185         BIT_SET(titles_seen, title_id);
186         BIT_SET(titles_this_round, title_id);
187
188         while (ANY_BITS_SET(titles_this_round, titles_bitfield_len)) {
189                 art_id i;
190
191                 cur_dist++;
192                 printf("Checking distance %d\n", cur_dist);
193
194                 free(titles_prev_round); /* free is safe to call on NULL */
195                 titles_prev_round = titles_this_round;
196                 titles_this_round = calloc(titles_bitfield_len, 1);
197
198                 for (i = 0; i < titles; i++) {
199                         int j;
200
201                         if (!BIT_ISSET(titles_prev_round, i))
202                                 continue;
203
204                         for (j = 0; j < linkis[i]; j++) {
205                                 art_id x = linki[i][j];
206                                 BIT_SET(titles_this_round, x);
207                         }
208                 }
209
210                 for (i = 0; i < titles_bitfield_len; i++) {
211                         titles_this_round[i] &= ~titles_seen[i];
212                         titles_seen[i] |= titles_this_round[i];
213                 }
214         }
215
216         /* Result is in titles_prev_round */
217
218         cur_dist -= 1;
219
220         printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
221         for (i = 0; i < titles; i++) {
222                 if (BIT_ISSET(titles_prev_round, i)) {
223                         printf("  %s\n", title[i]);
224                 }
225         }
226
227
228         return 0;
229 }