summary | refs | log | tree | commit | diff
path: root/links-plain-to-binary.c
diff options
context:
space:
mode:
author norly <ny-git@enpas.org> 2019-07-14 17:28:47 +0200
committer norly <ny-git@enpas.org> 2019-07-14 17:28:47 +0200
commit f0f54296b5b445c6ce0e47486bcdcb0deca582ff (patch)
tree 35c3858bab40f4bf8f6c57e2d5522f17d2928511 /links-plain-to-binary.c
parent 64907e38005ada5b2a545ae58f05d0fd616ffa79 (diff)
Move to Makefile and .gitignore
Diffstat (limited to 'links-plain-to-binary.c')
-rw-r--r-- links-plain-to-binary.c 178
1 files changed, 0 insertions, 178 deletions
diff --git a/links-plain-to-binary.c b/links-plain-to-binary.c
deleted file mode 100644
index 005a496..0000000
--- a/links-plain-to-binary.c
+++ /dev/null
@@ -1,178 +0,0 @@
#include <assert.h>
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
-
-
/*
 * Article identifier: an index into the sorted title table. Values of this
 * type are also written verbatim to the binary link file, so its width is
 * part of that file's format. Uses the standard <stdint.h> name rather than
 * the implementation-reserved __uint32_t.
 */
typedef uint32_t art_id;
-
-
/*
 * Comparator for qsort() over an array of C strings: each argument is the
 * address of an array element (a pointer to a string pointer). The strings
 * are ordered without regard to letter case.
 */
static int
cmpstring_pp_pp(const void *p1, const void *p2)
{
	char * const *lhs = p1;
	char * const *rhs = p2;

	return strcasecmp(*lhs, *rhs);
}
-
/*
 * Comparator for bsearch() over an array of C strings.
 *
 * p1 is the search key and is the string itself; p2 is the address of an
 * array element (a pointer to a string pointer). The comparison ignores
 * letter case. Neither string is modified, so both are accessed through
 * const-qualified pointers instead of casting the qualifier away.
 */
static int
cmpstring_p_pp(const void *p1, const void *p2)
{
	const char *key = p1;
	const char * const *element = p2;

	return strcasecmp(key, *element);
}
-
-
-
-
-int main()
-{
- FILE *in_file;
- FILE *out_file;
-
- char **title = NULL;
- art_id titles = 0;
- art_id titles_alloc = 0;
-
- art_id **link;
- art_id *links;
-
- art_id link_titles_done = 0;
-
- art_id i;
-
-
- /*
- * Read all titles into memory
- */
-
- in_file = fopen("titles.txt", "r");
- while (!feof(in_file)) {
- char *in_line = NULL;
- ssize_t in_line_len = 0;
- size_t zero = 0;
-
- if (titles == titles_alloc) {
- titles_alloc += 100000;
- title = realloc(title, titles_alloc * sizeof(title[0]));
- }
-
- in_line_len = getline(&in_line, &zero, in_file);
-
- /* Ignore empty lines and errors */
- if (in_line_len < 2) {
- continue;
- }
-
- /* Delete trailing newline */
- in_line[in_line_len - 1] = '\0';
-
- title[titles] = in_line;
- titles++;
- }
- fclose(in_file);
-
- qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
-
- printf("Sorting done.\n");
-
-
-
- link = malloc(titles * sizeof(art_id*));
- links = malloc(titles * sizeof(art_id));
-
- in_file = fopen("enwiki-links-plain.txt", "r");
- while (!feof(in_file)) {
- char *in_line = NULL;
- ssize_t in_line_len = 0;
- size_t zero = 0;
-
- in_line_len = getline(&in_line, &zero, in_file);
-
- /* Ignore empty lines and errors */
- if (in_line_len < 2) {
- //printf("%d\n", links[i]);
- if (in_line)
- free(in_line);
- continue;
- }
-
- /* Delete trailing newline */
- in_line[in_line_len - 1] = '\0';
-
- if (in_line_len > 5
- && !memcmp(in_line, "~~~~", 4)) {
- /* Title */
- char **cur_title = bsearch(&in_line[4], title,
- titles, sizeof(title[0]),
- cmpstring_p_pp);
-
- if (!cur_title) {
- printf("TITLE NOT FOUND: %s\n", in_line);
- assert(cur_title);
- }
-
- i = cur_title - title;
- //printf("%s\n", title[i]);
-
- link_titles_done++;
- if (0 == (link_titles_done % 100000)) {
- printf("%d\n", link_titles_done);
- }
- } else {
- /* Link */
-
- /* Delete trailing anchor */
- strtok(in_line, "#");
-
- char **cur_link = bsearch(&in_line[0], title,
- titles, sizeof(title[0]),
- cmpstring_p_pp);
-
- if (!cur_link) {
- //printf("LINK NOT FOUND: %s\n", in_line);
- free(in_line);
- continue;
- }
-
- links[i]++;
- link[i] = realloc(link[i], links[i] * sizeof(art_id));
-
- link[i][links[i] - 1] = cur_link - title;
-
- //printf("%s -- %d\n", title[i], links[i]);
- }
-
- free(in_line);
- }
- fclose(in_file);
-
- printf("Links identified.\n");
-
-
-
- out_file = fopen("titles-sorted.txt", "w");
- for (i = 0; i < titles; i++) {
- fputs(title[i], out_file);
- fputs("\n", out_file);
- }
- fclose(out_file);
-
- printf("Titles written.\n");
-
-
-
- out_file = fopen("links-outgoing.bin", "wb");
- fwrite(&titles, sizeof(titles), 1, out_file);
- for (i = 0; i < titles; i++) {
- art_id j;
-
- fwrite(&links[i], sizeof(links[i]), 1, out_file);
-
- for (j = 0; j < links[i]; j++) {
- fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
- }
- }
- fclose(out_file);
-
- printf("Links written.\n");
-}