diff options
author | norly <ny-git@enpas.org> | 2019-07-14 17:28:47 +0200 |
---|---|---|
committer | norly <ny-git@enpas.org> | 2019-07-14 17:28:47 +0200 |
commit | f0f54296b5b445c6ce0e47486bcdcb0deca582ff (patch) | |
tree | 35c3858bab40f4bf8f6c57e2d5522f17d2928511 /links-plain-to-binary.c | |
parent | 64907e38005ada5b2a545ae58f05d0fd616ffa79 (diff) |
Move to Makefile and .gitignore
Diffstat (limited to 'links-plain-to-binary.c')
-rw-r--r-- | links-plain-to-binary.c | 178 |
1 files changed, 0 insertions, 178 deletions
/*
 * links-plain-to-binary.c
 *
 * Converts a plain-text link dump into a binary adjacency list.
 *
 * Inputs (read from the current directory):
 *   titles.txt              one article title per line
 *   enwiki-links-plain.txt  "~~~~<title>" lines, each followed by the link
 *                           targets of that article, one per line
 *
 * Outputs (written to the current directory):
 *   titles-sorted.txt       the titles, sorted with strcasecmp()
 *   links-outgoing.bin      the number of titles, then for every title (in
 *                           sorted order) its link count followed by the
 *                           indices of its link targets; every value is a
 *                           32-bit unsigned integer in host byte order
 */

/* Must precede every #include: exposes getline(), strcasecmp(), ssize_t. */
#define _POSIX_C_SOURCE 200809L

#include <assert.h>
#include <ctype.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>


/* Index of an article in the sorted title array. */
typedef uint32_t art_id;

enum {
	TITLE_ALLOC_STEP = 100000,	/* growth increment of the title array */
	PROGRESS_STEP = 100000		/* print progress every N source titles */
};

/* Prefix that marks a "source article" line in the links file. */
static const char title_marker[] = "~~~~";
#define TITLE_MARKER_LEN (sizeof(title_marker) - 1)


/* qsort() comparator: both arguments point to elements of the title array. */
static int
cmpstring_pp_pp(const void *p1, const void *p2)
{
	return strcasecmp(*(char *const *)p1, *(char *const *)p2);
}

/* bsearch() comparator: p1 is the key string, p2 points to an array element. */
static int
cmpstring_p_pp(const void *p1, const void *p2)
{
	return strcasecmp((const char *)p1, *(char *const *)p2);
}


/* Report a failed operation on `what` via perror() and terminate. */
static void
die(const char *what)
{
	perror(what);
	exit(EXIT_FAILURE);
}

/* Terminate with an out-of-memory message. */
static void
die_oom(void)
{
	fputs("Out of memory\n", stderr);
	exit(EXIT_FAILURE);
}

/* fopen() that terminates the program on failure. */
static FILE *
xfopen(const char *path, const char *mode)
{
	FILE *f = fopen(path, mode);

	if (!f) {
		die(path);
	}
	return f;
}

/* Overflow-checked realloc() of `count` elements; terminates on failure. */
static void *
xrealloc(void *ptr, size_t count, size_t size)
{
	void *tmp;

	if (size != 0 && count > SIZE_MAX / size) {
		die_oom();
	}
	tmp = realloc(ptr, count * size);
	if (!tmp) {
		die_oom();
	}
	return tmp;
}

/*
 * Zero-initialising allocation that terminates on failure.
 * A request for zero elements is rounded up to one so that the result is
 * always a valid, non-NULL pointer.
 */
static void *
xcalloc(size_t count, size_t size)
{
	void *p = calloc(count ? count : 1, size);

	if (!p) {
		die_oom();
	}
	return p;
}

/*
 * Read the next line of `in` into the getline()-managed buffer *line and
 * strip one trailing newline, if there is one.
 *
 * Returns the length of the stripped line, or -1 when getline() fails
 * (end of file or error).
 */
static ssize_t
read_line(char **line, size_t *cap, FILE *in)
{
	ssize_t len = getline(line, cap, in);

	if (len > 0 && (*line)[len - 1] == '\n') {
		(*line)[--len] = '\0';
	}
	return len;
}

/*
 * Case-insensitively look up `key` in the sorted title array.
 * On success stores the array index in *index and returns true.
 */
static bool
find_title(const char *key, char *const *title, art_id titles, art_id *index)
{
	char *const *hit;

	if (titles == 0) {
		return false;
	}

	hit = bsearch(key, title, titles, sizeof(title[0]), cmpstring_p_pp);
	if (!hit) {
		return false;
	}

	*index = (art_id)(hit - title);
	return true;
}

/*
 * Read all non-empty lines of `path` into a newly allocated array of
 * heap-allocated strings. Stores the number of strings in *count.
 * The caller owns the array and every string in it.
 */
static char **
load_titles(const char *path, art_id *count)
{
	FILE *in = xfopen(path, "r");
	char **title = NULL;
	art_id titles = 0;
	art_id titles_alloc = 0;
	char *line = NULL;
	size_t cap = 0;
	ssize_t len;

	while ((len = read_line(&line, &cap, in)) != -1) {
		/* Ignore empty lines */
		if (len == 0) {
			continue;
		}

		if (titles == titles_alloc) {
			if (titles_alloc > UINT32_MAX - TITLE_ALLOC_STEP) {
				fputs("Too many titles\n", stderr);
				exit(EXIT_FAILURE);
			}
			titles_alloc += TITLE_ALLOC_STEP;
			title = xrealloc(title, titles_alloc, sizeof(title[0]));
		}

		/*
		 * The array takes ownership of the buffer; resetting line/cap
		 * makes getline() allocate a fresh one for the next line.
		 */
		title[titles++] = line;
		line = NULL;
		cap = 0;
	}

	/* getline() returned -1 without reaching EOF: read or memory error. */
	if (!feof(in)) {
		die(path);
	}
	free(line);
	fclose(in);

	*count = titles;
	return title;
}

/*
 * Parse the links file `path`.
 *
 * A line starting with "~~~~" selects the source article for the link
 * lines that follow it; an unknown source title is a fatal error. Every
 * other line is a link target: anything from the first '#' on is cut off,
 * and targets that are not known titles are silently dropped.
 *
 * link[] and links[] must have `titles` zero-initialised elements; on
 * return link[s] holds links[s] target indices for every source s.
 */
static void
load_links(const char *path, char *const *title, art_id titles,
	   art_id **link, art_id *links)
{
	FILE *in = xfopen(path, "r");
	char *line = NULL;
	size_t cap = 0;
	ssize_t len;
	bool have_source = false;
	art_id source = 0;
	art_id link_titles_done = 0;

	while ((len = read_line(&line, &cap, in)) != -1) {
		art_id target;
		char *anchor;

		/* Ignore empty lines */
		if (len == 0) {
			continue;
		}

		if ((size_t)len > TITLE_MARKER_LEN
		    && !memcmp(line, title_marker, TITLE_MARKER_LEN)) {
			/* Title */
			if (!find_title(line + TITLE_MARKER_LEN, title, titles,
					&source)) {
				fprintf(stderr, "TITLE NOT FOUND: %s\n", line);
				exit(EXIT_FAILURE);
			}
			have_source = true;

			link_titles_done++;
			if (0 == (link_titles_done % PROGRESS_STEP)) {
				printf("%" PRIu32 "\n", link_titles_done);
			}
			continue;
		}

		/* Link */

		/* A link before the first title has no article to belong to. */
		if (!have_source) {
			continue;
		}

		/* Delete trailing anchor */
		anchor = strchr(line, '#');
		if (anchor) {
			*anchor = '\0';
		}

		if (!find_title(line, title, titles, &target)) {
			continue;
		}

		link[source] = xrealloc(link[source],
					(size_t)links[source] + 1,
					sizeof(link[source][0]));
		link[source][links[source]++] = target;
	}

	/* getline() returned -1 without reaching EOF: read or memory error. */
	if (!feof(in)) {
		die(path);
	}
	free(line);
	fclose(in);
}

/* Write every title followed by a newline to `path`. */
static void
write_titles(const char *path, char *const *title, art_id titles)
{
	FILE *out = xfopen(path, "w");
	art_id i;

	for (i = 0; i < titles; i++) {
		if (fputs(title[i], out) == EOF || fputc('\n', out) == EOF) {
			die(path);
		}
	}

	/* fclose() flushes, so a full disk may only show up here. */
	if (fclose(out) != 0) {
		die(path);
	}
}

/*
 * Write the adjacency list to `path`: the title count, then for each title
 * its link count followed by its link targets, as raw art_id values.
 */
static void
write_links(const char *path, art_id titles,
	    art_id *const *link, const art_id *links)
{
	FILE *out = xfopen(path, "wb");
	art_id i;

	if (fwrite(&titles, sizeof(titles), 1, out) != 1) {
		die(path);
	}

	for (i = 0; i < titles; i++) {
		if (fwrite(&links[i], sizeof(links[i]), 1, out) != 1) {
			die(path);
		}

		/* link[i] is NULL for articles without links. */
		if (links[i] > 0
		    && fwrite(link[i], sizeof(link[i][0]), links[i], out)
		       != links[i]) {
			die(path);
		}
	}

	if (fclose(out) != 0) {
		die(path);
	}
}


int
main(void)
{
	char **title;
	art_id titles = 0;

	art_id **link;
	art_id *links;

	art_id i;


	/*
	 * Read all titles into memory
	 */

	title = load_titles("titles.txt", &titles);

	if (titles > 0) {
		qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
	}

	printf("Sorting done.\n");


	/*
	 * Zero-initialised: every article starts with no links. This relies
	 * on all-bits-zero being a null pointer, which holds on POSIX systems.
	 */
	link = xcalloc(titles, sizeof(link[0]));
	links = xcalloc(titles, sizeof(links[0]));

	load_links("enwiki-links-plain.txt", title, titles, link, links);

	printf("Links identified.\n");


	write_titles("titles-sorted.txt", title, titles);

	printf("Titles written.\n");


	write_links("links-outgoing.bin", titles, link, links);

	printf("Links written.\n");


	for (i = 0; i < titles; i++) {
		free(link[i]);
		free(title[i]);
	}
	free(link);
	free(links);
	free(title);

	return 0;
}