50f2dbc7685c84480c5d9c5fa5989a29ce3297b9
[enwiki-links-graph.git] / lookup-incoming.c
1 #include <assert.h>
2 #include <ctype.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6
7
8 static int
9 cmpstring_p_pp(const void *p1, const void *p2)
10 {
11         return strcasecmp((char * const) p1, * (char * const *) p2);
12 }
13
14
15
16
17 int main(int argc, char **argv)
18 {
19         FILE *in_file;
20         FILE *out_file;
21
22         char **title = NULL;
23         size_t titles;
24         size_t titles_read = 0;
25
26         size_t **linki;
27         size_t *linkis;
28
29         char **cur_title;
30         size_t title_id;
31
32         size_t *dist_table;
33         size_t cur_dist;
34         int cur_dist_is_not_last;
35
36         size_t i;
37
38
39         if (argc < 2) {
40                 printf("Supply article name to look up articles linking to it.\n");
41                 return 1;
42         }
43
44
45         /*
46          * Read all incoming links into memory
47          */
48
49         in_file = fopen("links-incoming.bin", "rb");
50
51         fread(&titles, sizeof(titles), 1, in_file);
52
53         linki = malloc(titles * sizeof(size_t*));
54         linkis = malloc(titles * sizeof(size_t));
55
56         for (i = 0; i < titles; i++) {
57                 size_t j;
58
59                 fread(&linkis[i], sizeof(linkis[i]), 1, in_file);
60                 //printf("linkis[%zd] = %zd\n", i, linkis[i]);
61
62                 linki[i] = malloc(linkis[i] * sizeof(linki[i][0]));
63
64                 j = fread(linki[i], sizeof(linki[i][0]), linkis[i], in_file);
65                 assert(j == linkis[i]);
66                 //for (j = 0; j < linkis[i]; j++) {
67                 //      fread(&linki[i][j], sizeof(linki[i][j]), 1, in_file);
68                 //}
69         }
70         printf("Incoming links read (%zd bytes).\n", ftell(in_file));
71         fclose(in_file);
72
73
74
75
76         /*
77          * Read all titles into memory
78          */
79
80         title = malloc(titles * sizeof(title[0]));
81
82         in_file = fopen("titles-sorted.txt", "r");
83         while (!feof(in_file)) {
84                 char *in_line = NULL;
85                 ssize_t in_line_len = 0;
86                 size_t zero = 0;
87
88                 in_line_len = getline(&in_line, &zero, in_file);
89
90                 /* Ignore empty lines and errors */
91                 if (in_line_len < 2) {
92                         continue;
93                 }
94
95                 /* Delete trailing newline */
96                 in_line[in_line_len - 1] = '\0';
97
98                 title[titles_read] = in_line;
99                 titles_read++;
100         }
101         fclose(in_file);
102
103         printf("Titles read.\n");
104
105
106
107
108         /* Look up article ID */
109         cur_title = bsearch(argv[1], title,
110                                 titles, sizeof(title[0]),
111                                 cmpstring_p_pp);
112
113         if (!cur_title) {
114                 printf("TITLE NOT FOUND: %s\n", argv[1]);
115                 return 1;
116         }
117
118         title_id = cur_title - title;
119
120         printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
121
122         for (i = 0; i < linkis[title_id]; i++) {
123                 size_t x = linki[title_id][i];
124                 printf("  %s\n", title[x]);
125         }
126
127
128
129         printf("\n\n\n\nBuilding table of distances...\n\n");
130
131         dist_table = malloc(titles * sizeof(dist_table[0]));
132
133         dist_table[title_id] = 0xdeadbeef;
134
135         for (i = 0; i < linkis[title_id]; i++) {
136                 size_t x = linki[title_id][i];
137                 dist_table[x] = 1;
138         }
139
140         cur_dist = 1;
141         cur_dist_is_not_last = 1;
142
143         while (cur_dist_is_not_last) {
144                 size_t articles_found = 0;
145
146                 cur_dist_is_not_last = 0;
147
148                 for (i = 0; i < titles; i++) {
149                         if (dist_table[i] == cur_dist) {
150                                 size_t j;
151
152                                 for (j = 0; j < linkis[i]; j++) {
153                                         size_t x = linki[i][j];
154
155                                         if (!dist_table[x]) {
156                                                 dist_table[x] = cur_dist + 1;
157                                         }
158                                 }
159
160                                 cur_dist_is_not_last = 1;
161
162                                 articles_found++;
163                         }
164                 }
165
166                 printf("Distance %zd: Found %zd articles.\n", cur_dist, articles_found);
167                 cur_dist++;
168         }
169
170         cur_dist -= 2;
171
172         printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
173         for (i = 0; i < titles; i++) {
174                 if (dist_table[i] == cur_dist) {
175                         printf("  %s\n", title[i]);
176                 }
177         }
178
179
180         return 0;
181 }