summary refs log tree commit diff
path: root/links-plain-to-binary.c
diff options
context:
space:
mode:
author norly <ny-git@enpas.org> 2019-07-05 00:14:06 +0200
committer norly <ny-git@enpas.org> 2019-07-05 00:14:06 +0200
commit d65a56c2382d31ede44a10cfe559f744660dbc99 (patch)
tree cecd2469107505121d6d13e4a36e6f15b12545dc /links-plain-to-binary.c
parent 07328c1036d1fbb7209bc5551fd086d25dbdfafb (diff)
Add final state from 2018-10-07
Diffstat (limited to 'links-plain-to-binary.c')
-rw-r--r-- links-plain-to-binary.c 174
1 files changed, 174 insertions, 0 deletions
diff --git a/links-plain-to-binary.c b/links-plain-to-binary.c
new file mode 100644
index 0000000..13ded29
--- /dev/null
+++ b/links-plain-to-binary.c
@@ -0,0 +1,174 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/types.h>
+
+
+/*
+ * Comparator for qsort() over an array of C strings.
+ *
+ * Both arguments point at array elements, i.e. at `char *` slots, so each
+ * is dereferenced once before the case-insensitive comparison.
+ * Returns <0, 0 or >0 as strcasecmp() does.
+ */
+static int
+cmpstring_pp_pp(const void *p1, const void *p2)
+{
+	char * const *lhs = p1;
+	char * const *rhs = p2;
+
+	return strcasecmp(*lhs, *rhs);
+}
+
+/*
+ * Comparator for bsearch() over an array of C strings.
+ *
+ * The first argument is the search key and is the string itself; the
+ * second points at an array element (a `char *` slot) and is dereferenced
+ * once. Comparison is case-insensitive, matching cmpstring_pp_pp().
+ */
+static int
+cmpstring_p_pp(const void *p1, const void *p2)
+{
+	const char *key = p1;
+	char * const *slot = p2;
+
+	return strcasecmp(key, *slot);
+}
+
+
+
+
+/*
+ * Report a fatal error for `what` (an operation or file name) via perror()
+ * and terminate the program with a failure status.
+ */
+static void
+die(const char *what)
+{
+	perror(what);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * fopen() wrapper that never returns NULL: terminates via die() when the
+ * file cannot be opened.
+ */
+static FILE *
+xfopen(const char *path, const char *mode)
+{
+	FILE *f = fopen(path, mode);
+
+	if (!f) {
+		die(path);
+	}
+	return f;
+}
+
+/*
+ * Remove a single trailing '\n' from `line` (of length `len`), if there is
+ * one. Returns the resulting length. A final line without a newline is
+ * left untouched instead of losing its last character.
+ */
+static size_t
+chomp(char *line, size_t len)
+{
+	if (len > 0 && line[len - 1] == '\n') {
+		line[--len] = '\0';
+	}
+	return len;
+}
+
+/*
+ * Look up `key` (case-insensitively) in the sorted `title` array of
+ * `titles` entries. Returns a pointer to the matching slot, or NULL when
+ * there is no match or the array is empty.
+ */
+static char **
+find_title(const char *key, char **title, size_t titles)
+{
+	if (titles == 0) {
+		return NULL;
+	}
+	return bsearch(key, title, titles, sizeof(title[0]), cmpstring_p_pp);
+}
+
+/*
+ * Convert a plain-text link dump into a binary adjacency list.
+ *
+ * Reads "titles.txt" (one title per line), sorts the titles
+ * case-insensitively, then reads "enwiki-links-plain.txt", in which a line
+ * starting with "~~~~" names the current source title and every other
+ * line is a link target of that source. Writes:
+ *   - "titles-sorted.txt": the sorted titles, one per line;
+ *   - "links-outgoing.bin": the number of titles, then for each title its
+ *     link count followed by the indices of the link targets. All values
+ *     are raw size_t objects as stored on the host.
+ *
+ * Returns 0 on success; terminates with EXIT_FAILURE on I/O or allocation
+ * errors and when a "~~~~" title is not present in titles.txt.
+ */
+int main(void)
+{
+	FILE *in_file;
+	FILE *out_file;
+
+	char **title = NULL;
+	size_t titles = 0;
+	size_t titles_alloc = 0;
+
+	size_t **link;
+	size_t *links;
+
+	size_t link_titles_done = 0;
+
+	/* Index of the current source title; only valid once have_title is set */
+	size_t i = 0;
+	int have_title = 0;
+
+	char *in_line = NULL;
+	size_t in_line_cap = 0;
+	ssize_t in_line_len;
+
+
+	/*
+	 * Read all titles into memory
+	 */
+
+	in_file = xfopen("titles.txt", "r");
+	while ((in_line_len = getline(&in_line, &in_line_cap, in_file)) != -1) {
+		size_t len = chomp(in_line, (size_t) in_line_len);
+
+		/* Ignore empty lines; the buffer is reused by the next getline() */
+		if (len == 0) {
+			continue;
+		}
+
+		if (titles == titles_alloc) {
+			char **grown;
+
+			titles_alloc += 100000;
+			grown = realloc(title, titles_alloc * sizeof(title[0]));
+			if (!grown) {
+				die("realloc");
+			}
+			title = grown;
+		}
+
+		/* title[] now owns the buffer; make getline() allocate a new one */
+		title[titles++] = in_line;
+		in_line = NULL;
+		in_line_cap = 0;
+	}
+	if (ferror(in_file)) {
+		die("titles.txt");
+	}
+	fclose(in_file);
+
+	if (titles > 0) {
+		qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
+	}
+
+	printf("Sorting done.\n");
+
+
+
+	/*
+	 * Zero-initialised: every title starts with no links and a NULL
+	 * target array, which realloc() below grows on demand.
+	 */
+	link = calloc(titles, sizeof(link[0]));
+	links = calloc(titles, sizeof(links[0]));
+	if (titles > 0 && (!link || !links)) {
+		die("calloc");
+	}
+
+	in_file = xfopen("enwiki-links-plain.txt", "r");
+	while ((in_line_len = getline(&in_line, &in_line_cap, in_file)) != -1) {
+		size_t len = chomp(in_line, (size_t) in_line_len);
+
+		/* Ignore empty lines */
+		if (len == 0) {
+			continue;
+		}
+
+		if (len > 4 && !memcmp(in_line, "~~~~", 4)) {
+			/* Title */
+			char **cur_title = find_title(&in_line[4], title, titles);
+
+			/* Checked explicitly so the guard survives NDEBUG builds */
+			if (!cur_title) {
+				printf("TITLE NOT FOUND: %s\n", in_line);
+				return EXIT_FAILURE;
+			}
+
+			i = (size_t) (cur_title - title);
+			have_title = 1;
+
+			link_titles_done++;
+			if (0 == (link_titles_done % 100000)) {
+				printf("%zu\n", link_titles_done);
+			}
+		} else {
+			/* Link */
+			char **cur_link;
+			size_t *grown;
+			char *anchor;
+
+			/* A link with no preceding title has no source to attach to */
+			if (!have_title) {
+				fprintf(stderr, "LINK BEFORE FIRST TITLE: %s\n", in_line);
+				continue;
+			}
+
+			/* Delete trailing anchor (everything from the first '#') */
+			anchor = strchr(in_line, '#');
+			if (anchor) {
+				*anchor = '\0';
+			}
+
+			cur_link = find_title(in_line, title, titles);
+
+			/* Links to unknown titles are dropped */
+			if (!cur_link) {
+				continue;
+			}
+
+			grown = realloc(link[i], (links[i] + 1) * sizeof(link[i][0]));
+			if (!grown) {
+				die("realloc");
+			}
+			link[i] = grown;
+			link[i][links[i]++] = (size_t) (cur_link - title);
+		}
+	}
+	if (ferror(in_file)) {
+		die("enwiki-links-plain.txt");
+	}
+	free(in_line);
+	fclose(in_file);
+
+	printf("Links identified.\n");
+
+
+
+	out_file = xfopen("titles-sorted.txt", "w");
+	for (i = 0; i < titles; i++) {
+		if (fputs(title[i], out_file) == EOF
+		    || fputs("\n", out_file) == EOF) {
+			die("titles-sorted.txt");
+		}
+	}
+	if (fclose(out_file) != 0) {
+		die("titles-sorted.txt");
+	}
+
+	printf("Titles written.\n");
+
+
+
+	out_file = xfopen("links-outgoing.bin", "wb");
+	if (fwrite(&titles, sizeof(titles), 1, out_file) != 1) {
+		die("links-outgoing.bin");
+	}
+	for (i = 0; i < titles; i++) {
+		if (fwrite(&links[i], sizeof(links[i]), 1, out_file) != 1) {
+			die("links-outgoing.bin");
+		}
+
+		/* One call per title; same bytes as writing element by element */
+		if (links[i] > 0
+		    && fwrite(link[i], sizeof(link[i][0]), links[i], out_file)
+		       != links[i]) {
+			die("links-outgoing.bin");
+		}
+	}
+	if (fclose(out_file) != 0) {
+		die("links-outgoing.bin");
+	}
+
+	printf("Links written.\n");
+
+	return 0;
+}