From d65a56c2382d31ede44a10cfe559f744660dbc99 Mon Sep 17 00:00:00 2001 From: norly Date: Fri, 5 Jul 2019 00:14:06 +0200 Subject: [PATCH] Add final state from 2018-10-07 --- convert-to-plain.sh | 20 ++++ links-outgoing-to-incoming.c | 89 +++++++++++++++++ links-plain-to-binary.c | 174 +++++++++++++++++++++++++++++++++ links-xml-to-plain.l | 10 ++ lookup-incoming.c | 181 +++++++++++++++++++++++++++++++++++ 5 files changed, 474 insertions(+) create mode 100644 convert-to-plain.sh create mode 100644 links-outgoing-to-incoming.c create mode 100644 links-plain-to-binary.c create mode 100644 links-xml-to-plain.l create mode 100644 lookup-incoming.c diff --git a/convert-to-plain.sh b/convert-to-plain.sh new file mode 100644 index 0000000..e7bd378 --- /dev/null +++ b/convert-to-plain.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Compile flex program to regex-convert the XML to plain text +#https://unix.stackexchange.com/a/413684 +flex -o links-xml-to-plain.c links-xml-to-plain.l +gcc -O3 -o links-xml-to-plain links-xml-to-plain.c -lfl + +# Convert to plain text +lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | ./links-xml-to-plain > enwiki-links-plain.txt + +# Extract titles +grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt + + + +gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c +time ./links-plain-to-binary + +gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c +time ./links-outgoing-to-incoming diff --git a/links-outgoing-to-incoming.c b/links-outgoing-to-incoming.c new file mode 100644 index 0000000..8a23ca7 --- /dev/null +++ b/links-outgoing-to-incoming.c @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include + + + +int main() +{ + FILE *in_file; + FILE *out_file; + + size_t titles; + + size_t **linko; + size_t *linkos; + + size_t **linki; + size_t *linkis; + + size_t link_titles_done = 0; + + size_t i; + + + /* + * Read all outgoing links into memory + */ + + in_file = fopen("links-outgoing.bin", "rb"); + + fread(&titles, sizeof(titles), 1, in_file); + + linko = malloc(titles * sizeof(size_t*)); + linkos = malloc(titles * sizeof(size_t)); + + for (i = 0; i < titles; i++) { + size_t j; + + fread(&linkos[i], sizeof(linkos[i]), 1, in_file); + + linko[i] = malloc(linkos[i] * sizeof(linko[i][0])); + + for (j = 0; j < linkos[i]; j++) { + fread(&linko[i][j], sizeof(linko[i][j]), 1, in_file); + } + } + fclose(in_file); + + printf("Outgoing links read.\n"); + + + + linki = malloc(titles * sizeof(size_t*)); + linkis = malloc(titles * sizeof(size_t)); + + for (i = 0; i < titles; i++) { + size_t j; + + for (j = 0; j < linkos[i]; j++) { + size_t x = linko[i][j]; + + linkis[x]++; + linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0])); + + linki[x][linkis[x] - 1] = i; + } + } + + printf("Links turned upside down.\n"); + + + + out_file = fopen("links-incoming.bin", "wb"); + fwrite(&titles, sizeof(titles), 1, out_file); + for (i = 0; i < titles; i++) { + size_t j; + + fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file); + + for (j = 0; j < linkis[i]; j++) { + fwrite(&linki[i][j], sizeof(linki[i][j]), 1, out_file); + } + } + fclose(out_file); + + printf("Incoming links written.\n"); +} diff --git a/links-plain-to-binary.c b/links-plain-to-binary.c new file mode 100644 index 0000000..13ded29 --- /dev/null +++ b/links-plain-to-binary.c @@ -0,0 +1,174 @@ +#include +#include +#include +#include +#include + + +static int +cmpstring_pp_pp(const void *p1, const void *p2) +{ + return strcasecmp(* (char * const *) p1, * (char * const *) p2); +} + +static int +cmpstring_p_pp(const void *p1, const void *p2) +{ + return strcasecmp((char * const) p1, * (char * const *) p2); +} + + + + +int main() +{ + FILE *in_file; + FILE *out_file; + + char **title = NULL; + size_t titles = 0; + size_t titles_alloc = 0; + + size_t **link; + size_t *links; + + size_t link_titles_done = 0; + + size_t i; + + + /* + * Read all titles into memory + */ + + in_file = fopen("titles.txt", "r"); + while (!feof(in_file)) { + char *in_line = NULL; + ssize_t in_line_len = 0; + size_t zero = 0; + + if (titles == titles_alloc) { + titles_alloc += 100000; + title = realloc(title, titles_alloc * sizeof(title[0])); + } + + in_line_len = getline(&in_line, &zero, in_file); + + /* Ignore empty lines and errors */ + if (in_line_len < 2) { + continue; + } + + /* Delete trailing newline */ + in_line[in_line_len - 1] = '\0'; + + title[titles] = in_line; + titles++; + } + fclose(in_file); + + qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp); + + printf("Sorting done.\n"); + + + + link = malloc(titles * sizeof(size_t*)); + links = malloc(titles * sizeof(size_t)); + + in_file = fopen("enwiki-links-plain.txt", "r"); + while (!feof(in_file)) { + char *in_line = NULL; + ssize_t in_line_len = 0; + size_t zero = 0; + + in_line_len = getline(&in_line, &zero, in_file); + + /* Ignore empty lines and errors */ + if (in_line_len < 2) { + //printf("%d\n", links[i]); + if (in_line) + free(in_line); + continue; + } + + /* Delete trailing newline */ + in_line[in_line_len - 1] = '\0'; + + if (in_line_len > 5 + && !memcmp(in_line, "~~~~", 4)) { + /* Title */ + char **cur_title = bsearch(&in_line[4], title, + titles, sizeof(title[0]), + cmpstring_p_pp); + + if (!cur_title) { + printf("TITLE NOT FOUND: %s\n", in_line); + assert(cur_title); + } + + i = cur_title - title; + //printf("%s\n", title[i]); + + link_titles_done++; + if (0 == (link_titles_done % 100000)) { + printf("%d\n", link_titles_done); + } + } else { + /* Link */ + + /* Delete trailing anchor */ + strtok(in_line, "#"); + + char **cur_link = bsearch(&in_line[0], title, + titles, sizeof(title[0]), + cmpstring_p_pp); + + if (!cur_link) { + //printf("LINK NOT FOUND: %s\n", in_line); + free(in_line); + continue; + } + + links[i]++; + link[i] = realloc(link[i], links[i] * sizeof(size_t)); + + link[i][links[i] - 1] = cur_link - title; + + //printf("%s -- %d\n", title[i], links[i]); + } + + free(in_line); + } + fclose(in_file); + + printf("Links identified.\n"); + + + + out_file = fopen("titles-sorted.txt", "w"); + for (i = 0; i < titles; i++) { + fputs(title[i], out_file); + fputs("\n", out_file); + } + fclose(out_file); + + printf("Titles written.\n"); + + + + out_file = fopen("links-outgoing.bin", "wb"); + fwrite(&titles, sizeof(titles), 1, out_file); + for (i = 0; i < titles; i++) { + size_t j; + + fwrite(&links[i], sizeof(links[i]), 1, out_file); + + for (j = 0; j < links[i]; j++) { + fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file); + } + } + fclose(out_file); + + printf("Links written.\n"); +} diff --git a/links-xml-to-plain.l b/links-xml-to-plain.l new file mode 100644 index 0000000..dabe538 --- /dev/null +++ b/links-xml-to-plain.l @@ -0,0 +1,10 @@ +%% +\ ; +\<\/d\> ; +\

; +\<\/p\> printf("\n"); +\ printf("~~~~"); +\<\/t\> printf("\n"); +\ ; +\<\/l\> printf("\n"); +%% \ No newline at end of file diff --git a/lookup-incoming.c b/lookup-incoming.c new file mode 100644 index 0000000..50f2dbc --- /dev/null +++ b/lookup-incoming.c @@ -0,0 +1,181 @@ +#include +#include +#include +#include +#include + + +static int +cmpstring_p_pp(const void *p1, const void *p2) +{ + return strcasecmp((char * const) p1, * (char * const *) p2); +} + + + + +int main(int argc, char **argv) +{ + FILE *in_file; + FILE *out_file; + + char **title = NULL; + size_t titles; + size_t titles_read = 0; + + size_t **linki; + size_t *linkis; + + char **cur_title; + size_t title_id; + + size_t *dist_table; + size_t cur_dist; + int cur_dist_is_not_last; + + size_t i; + + + if (argc < 2) { + printf("Supply article name to look up articles linking to it.\n"); + return 1; + } + + + /* + * Read all incoming links into memory + */ + + in_file = fopen("links-incoming.bin", "rb"); + + fread(&titles, sizeof(titles), 1, in_file); + + linki = malloc(titles * sizeof(size_t*)); + linkis = malloc(titles * sizeof(size_t)); + + for (i = 0; i < titles; i++) { + size_t j; + + fread(&linkis[i], sizeof(linkis[i]), 1, in_file); + //printf("linkis[%zd] = %zd\n", i, linkis[i]); + + linki[i] = malloc(linkis[i] * sizeof(linki[i][0])); + + j = fread(linki[i], sizeof(linki[i][0]), linkis[i], in_file); + assert(j == linkis[i]); + //for (j = 0; j < linkis[i]; j++) { + // fread(&linki[i][j], sizeof(linki[i][j]), 1, in_file); + //} + } + printf("Incoming links read (%zd bytes).\n", ftell(in_file)); + fclose(in_file); + + + + + /* + * Read all titles into memory + */ + + title = malloc(titles * sizeof(title[0])); + + in_file = fopen("titles-sorted.txt", "r"); + while (!feof(in_file)) { + char *in_line = NULL; + ssize_t in_line_len = 0; + size_t zero = 0; + + in_line_len = getline(&in_line, &zero, in_file); + + /* Ignore empty lines and errors */ + if (in_line_len < 2) { + continue; + } + + /* Delete trailing newline */ + in_line[in_line_len - 1] = '\0'; + + title[titles_read] = in_line; + titles_read++; + } + fclose(in_file); + + printf("Titles read.\n"); + + + + + /* Look up article ID */ + cur_title = bsearch(argv[1], title, + titles, sizeof(title[0]), + cmpstring_p_pp); + + if (!cur_title) { + printf("TITLE NOT FOUND: %s\n", argv[1]); + return 1; + } + + title_id = cur_title - title; + + printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]); + + for (i = 0; i < linkis[title_id]; i++) { + size_t x = linki[title_id][i]; + printf(" %s\n", title[x]); + } + + + + printf("\n\n\n\nBuilding table of distances...\n\n"); + + dist_table = malloc(titles * sizeof(dist_table[0])); + + dist_table[title_id] = 0xdeadbeef; + + for (i = 0; i < linkis[title_id]; i++) { + size_t x = linki[title_id][i]; + dist_table[x] = 1; + } + + cur_dist = 1; + cur_dist_is_not_last = 1; + + while (cur_dist_is_not_last) { + size_t articles_found = 0; + + cur_dist_is_not_last = 0; + + for (i = 0; i < titles; i++) { + if (dist_table[i] == cur_dist) { + size_t j; + + for (j = 0; j < linkis[i]; j++) { + size_t x = linki[i][j]; + + if (!dist_table[x]) { + dist_table[x] = cur_dist + 1; + } + } + + cur_dist_is_not_last = 1; + + articles_found++; + } + } + + printf("Distance %zd: Found %zd articles.\n", cur_dist, articles_found); + cur_dist++; + } + + cur_dist -= 2; + + printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist); + for (i = 0; i < titles; i++) { + if (dist_table[i] == cur_dist) { + printf(" %s\n", title[i]); + } + } + + + return 0; +} -- 2.30.2