Move to __uint32_t from glibc's bits/types.c
[enwiki-links-graph.git] / lookup-incoming.c
1 #include <assert.h>
2 #include <ctype.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <sys/types.h>
7
8
9 typedef __uint32_t art_id;
10
11
12 static int
13 cmpstring_p_pp(const void *p1, const void *p2)
14 {
15         return strcasecmp((char * const) p1, * (char * const *) p2);
16 }
17
18
19
20
21 int main(int argc, char **argv)
22 {
23         FILE *in_file;
24         FILE *out_file;
25
26         char **title = NULL;
27         art_id titles;
28         art_id titles_read = 0;
29
30         art_id **linki;
31         art_id *linkis;
32
33         char **cur_title;
34         art_id title_id;
35
36         art_id *dist_table;
37         art_id cur_dist;
38         int cur_dist_is_not_last;
39
40         art_id i;
41
42
43         if (argc < 2) {
44                 printf("Supply article name to look up articles linking to it.\n");
45                 return 1;
46         }
47
48
49         /*
50          * Read all incoming links into memory
51          */
52
53         in_file = fopen("links-incoming.bin", "rb");
54
55         fread(&titles, sizeof(titles), 1, in_file);
56
57         linki = malloc(titles * sizeof(art_id*));
58         linkis = malloc(titles * sizeof(art_id));
59
60         for (i = 0; i < titles; i++) {
61                 art_id j;
62
63                 fread(&linkis[i], sizeof(linkis[i]), 1, in_file);
64                 //printf("linkis[%zd] = %zd\n", i, linkis[i]);
65
66                 linki[i] = malloc(linkis[i] * sizeof(linki[i][0]));
67
68                 j = fread(linki[i], sizeof(linki[i][0]), linkis[i], in_file);
69                 assert(j == linkis[i]);
70                 //for (j = 0; j < linkis[i]; j++) {
71                 //      fread(&linki[i][j], sizeof(linki[i][j]), 1, in_file);
72                 //}
73         }
74         printf("Incoming links read (%zd bytes).\n", ftell(in_file));
75         fclose(in_file);
76
77
78
79
80         /*
81          * Read all titles into memory
82          */
83
84         title = malloc(titles * sizeof(title[0]));
85
86         in_file = fopen("titles-sorted.txt", "r");
87         while (!feof(in_file)) {
88                 char *in_line = NULL;
89                 ssize_t in_line_len = 0;
90                 size_t zero = 0;
91
92                 in_line_len = getline(&in_line, &zero, in_file);
93
94                 /* Ignore empty lines and errors */
95                 if (in_line_len < 2) {
96                         continue;
97                 }
98
99                 /* Delete trailing newline */
100                 in_line[in_line_len - 1] = '\0';
101
102                 title[titles_read] = in_line;
103                 titles_read++;
104         }
105         fclose(in_file);
106
107         printf("Titles read.\n");
108
109
110
111
112         /* Look up article ID */
113         cur_title = bsearch(argv[1], title,
114                                 titles, sizeof(title[0]),
115                                 cmpstring_p_pp);
116
117         if (!cur_title) {
118                 printf("TITLE NOT FOUND: %s\n", argv[1]);
119                 return 1;
120         }
121
122         title_id = cur_title - title;
123
124         printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
125
126         for (i = 0; i < linkis[title_id]; i++) {
127                 art_id x = linki[title_id][i];
128                 printf("  %s\n", title[x]);
129         }
130
131
132
133         printf("\n\n\n\nBuilding table of distances...\n\n");
134
135         dist_table = malloc(titles * sizeof(dist_table[0]));
136
137         dist_table[title_id] = 0xdeadbeef;
138
139         for (i = 0; i < linkis[title_id]; i++) {
140                 art_id x = linki[title_id][i];
141                 dist_table[x] = 1;
142         }
143
144         cur_dist = 1;
145         cur_dist_is_not_last = 1;
146
147         while (cur_dist_is_not_last) {
148                 art_id articles_found = 0;
149
150                 cur_dist_is_not_last = 0;
151
152                 for (i = 0; i < titles; i++) {
153                         if (dist_table[i] == cur_dist) {
154                                 art_id j;
155
156                                 for (j = 0; j < linkis[i]; j++) {
157                                         art_id x = linki[i][j];
158
159                                         if (!dist_table[x]) {
160                                                 dist_table[x] = cur_dist + 1;
161                                         }
162                                 }
163
164                                 cur_dist_is_not_last = 1;
165
166                                 articles_found++;
167                         }
168                 }
169
170                 printf("Distance %zd: Found %zd articles.\n", cur_dist, articles_found);
171                 cur_dist++;
172         }
173
174         cur_dist -= 2;
175
176         printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
177         for (i = 0; i < titles; i++) {
178                 if (dist_table[i] == cur_dist) {
179                         printf("  %s\n", title[i]);
180                 }
181         }
182
183
184         return 0;
185 }