Add final state from 2018-10-07
authornorly <ny-git@enpas.org>
Thu, 4 Jul 2019 22:14:06 +0000 (00:14 +0200)
committernorly <ny-git@enpas.org>
Thu, 4 Jul 2019 22:14:06 +0000 (00:14 +0200)
convert-to-plain.sh [new file with mode: 0644]
links-outgoing-to-incoming.c [new file with mode: 0644]
links-plain-to-binary.c [new file with mode: 0644]
links-xml-to-plain.l [new file with mode: 0644]
lookup-incoming.c [new file with mode: 0644]

diff --git a/convert-to-plain.sh b/convert-to-plain.sh
new file mode 100644 (file)
index 0000000..e7bd378
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Compile flex program to regex-convert the XML to plain text
+#https://unix.stackexchange.com/a/413684
+flex -o links-xml-to-plain.c links-xml-to-plain.l
+gcc -O3  -o links-xml-to-plain links-xml-to-plain.c -lfl
+
+# Convert to plain text
+lzop -dc enwiki-links.xml.lzo | pv | tail -c +45 | ./links-xml-to-plain > enwiki-links-plain.txt
+
+# Extract titles
+grep ^~~~~ enwiki-links-plain.txt | sed "s/^~~~~//g;te;d;:e" | sort > titles.txt
+
+
+
+gcc -O3 -g -o links-plain-to-binary links-plain-to-binary.c
+time ./links-plain-to-binary
+
+gcc -O3 -g -o links-outgoing-to-incoming links-outgoing-to-incoming.c
+time ./links-outgoing-to-incoming
diff --git a/links-outgoing-to-incoming.c b/links-outgoing-to-incoming.c
new file mode 100644 (file)
index 0000000..8a23ca7
--- /dev/null
@@ -0,0 +1,89 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+
+int main()
+{
+       FILE *in_file;
+       FILE *out_file;
+
+       size_t titles;
+
+       size_t **linko;
+       size_t *linkos;
+
+       size_t **linki;
+       size_t *linkis;
+
+       size_t link_titles_done = 0;
+
+       size_t i;
+
+
+       /*
+        * Read all outgoing links into memory
+        */
+
+       in_file = fopen("links-outgoing.bin", "rb");
+
+       fread(&titles, sizeof(titles), 1, in_file);
+
+       linko = malloc(titles * sizeof(size_t*));
+       linkos = malloc(titles * sizeof(size_t));
+
+       for (i = 0; i < titles; i++) {
+               size_t j;
+
+               fread(&linkos[i], sizeof(linkos[i]), 1, in_file);
+
+               linko[i] = malloc(linkos[i] * sizeof(linko[i][0]));
+
+               for (j = 0; j < linkos[i]; j++) {
+                       fread(&linko[i][j], sizeof(linko[i][j]), 1, in_file);
+               }
+       }
+       fclose(in_file);
+
+       printf("Outgoing links read.\n");
+
+
+
+       linki = malloc(titles * sizeof(size_t*));
+       linkis = malloc(titles * sizeof(size_t));
+
+       for (i = 0; i < titles; i++) {
+               size_t j;
+
+               for (j = 0; j < linkos[i]; j++) {
+                       size_t x = linko[i][j];
+
+                       linkis[x]++;
+                       linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0]));
+
+                       linki[x][linkis[x] - 1] = i;
+               }
+       }
+
+       printf("Links turned upside down.\n");
+
+
+
+       out_file = fopen("links-incoming.bin", "wb");
+       fwrite(&titles, sizeof(titles), 1, out_file);
+       for (i = 0; i < titles; i++) {
+               size_t j;
+
+               fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file);
+
+               for (j = 0; j < linkis[i]; j++) {
+                       fwrite(&linki[i][j], sizeof(linki[i][j]), 1, out_file);
+               }
+       }
+       fclose(out_file);
+
+       printf("Incoming links written.\n");
+}
diff --git a/links-plain-to-binary.c b/links-plain-to-binary.c
new file mode 100644 (file)
index 0000000..13ded29
--- /dev/null
@@ -0,0 +1,174 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+static int
+cmpstring_pp_pp(const void *p1, const void *p2)
+{
+       return strcasecmp(* (char * const *) p1, * (char * const *) p2);
+}
+
+static int
+cmpstring_p_pp(const void *p1, const void *p2)
+{
+       return strcasecmp((char * const) p1, * (char * const *) p2);
+}
+
+
+
+
+int main()
+{
+       FILE *in_file;
+       FILE *out_file;
+
+       char **title = NULL;
+       size_t titles = 0;
+       size_t titles_alloc = 0;
+
+       size_t **link;
+       size_t *links;
+
+       size_t link_titles_done = 0;
+
+       size_t i;
+
+
+       /*
+        * Read all titles into memory
+        */
+
+       in_file = fopen("titles.txt", "r");
+       while (!feof(in_file)) {
+               char *in_line = NULL;
+               ssize_t in_line_len = 0;
+               size_t zero = 0;
+
+               if (titles == titles_alloc) {
+                       titles_alloc += 100000;
+                       title = realloc(title, titles_alloc * sizeof(title[0]));
+               }
+
+               in_line_len = getline(&in_line, &zero, in_file);
+
+               /* Ignore empty lines and errors */
+               if (in_line_len < 2) {
+                       continue;
+               }
+
+               /* Delete trailing newline */
+               in_line[in_line_len - 1] = '\0';
+
+               title[titles] = in_line;
+               titles++;
+       }
+       fclose(in_file);
+
+       qsort(title, titles, sizeof(title[0]), cmpstring_pp_pp);
+
+       printf("Sorting done.\n");
+
+
+
+       link = malloc(titles * sizeof(size_t*));
+       links = malloc(titles * sizeof(size_t));
+
+       in_file = fopen("enwiki-links-plain.txt", "r");
+       while (!feof(in_file)) {
+               char *in_line = NULL;
+               ssize_t in_line_len = 0;
+               size_t zero = 0;
+
+               in_line_len = getline(&in_line, &zero, in_file);
+
+               /* Ignore empty lines and errors */
+               if (in_line_len < 2) {
+                       //printf("%d\n", links[i]);
+                       if (in_line)
+                               free(in_line);
+                       continue;
+               }
+
+               /* Delete trailing newline */
+               in_line[in_line_len - 1] = '\0';
+
+               if (in_line_len > 5
+                   && !memcmp(in_line, "~~~~", 4)) {
+                       /* Title */
+                       char **cur_title = bsearch(&in_line[4], title,
+                                               titles, sizeof(title[0]),
+                                               cmpstring_p_pp);
+
+                       if (!cur_title) {
+                               printf("TITLE NOT FOUND: %s\n", in_line);
+                               assert(cur_title);
+                       }
+
+                       i = cur_title - title;
+                       //printf("%s\n", title[i]);
+
+                       link_titles_done++;
+                       if (0 == (link_titles_done % 100000)) {
+                               printf("%d\n", link_titles_done);
+                       }
+               } else {
+                       /* Link */
+
+                       /* Delete trailing anchor */
+                       strtok(in_line, "#");
+
+                       char **cur_link = bsearch(&in_line[0], title,
+                                               titles, sizeof(title[0]),
+                                               cmpstring_p_pp);
+
+                       if (!cur_link) {
+                               //printf("LINK NOT FOUND: %s\n", in_line);
+                               free(in_line);
+                               continue;
+                       }
+
+                       links[i]++;
+                       link[i] = realloc(link[i], links[i] * sizeof(size_t));
+
+                       link[i][links[i] - 1] = cur_link - title;
+
+                       //printf("%s -- %d\n", title[i], links[i]);
+               }
+
+               free(in_line);
+       }
+       fclose(in_file);
+
+       printf("Links identified.\n");
+
+
+
+       out_file = fopen("titles-sorted.txt", "w");
+       for (i = 0; i < titles; i++) {
+               fputs(title[i], out_file);
+               fputs("\n", out_file);
+       }
+       fclose(out_file);
+
+       printf("Titles written.\n");
+
+
+
+       out_file = fopen("links-outgoing.bin", "wb");
+       fwrite(&titles, sizeof(titles), 1, out_file);
+       for (i = 0; i < titles; i++) {
+               size_t j;
+
+               fwrite(&links[i], sizeof(links[i]), 1, out_file);
+
+               for (j = 0; j < links[i]; j++) {
+                       fwrite(&link[i][j], sizeof(link[i][j]), 1, out_file);
+               }
+       }
+       fclose(out_file);
+
+       printf("Links written.\n");
+}
diff --git a/links-xml-to-plain.l b/links-xml-to-plain.l
new file mode 100644 (file)
index 0000000..dabe538
--- /dev/null
@@ -0,0 +1,10 @@
+%%
+\<d\>     ;
+\<\/d\>   ;
+\<p>      ;
+\<\/p\>   printf("\n");
+\<t>      printf("~~~~");
+\<\/t\>   printf("\n");
+\<l>      ;
+\<\/l\>   printf("\n");
+%%
\ No newline at end of file
diff --git a/lookup-incoming.c b/lookup-incoming.c
new file mode 100644 (file)
index 0000000..50f2dbc
--- /dev/null
@@ -0,0 +1,181 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+static int
+cmpstring_p_pp(const void *p1, const void *p2)
+{
+       return strcasecmp((char * const) p1, * (char * const *) p2);
+}
+
+
+
+
+int main(int argc, char **argv)
+{
+       FILE *in_file;
+       FILE *out_file;
+
+       char **title = NULL;
+       size_t titles;
+       size_t titles_read = 0;
+
+       size_t **linki;
+       size_t *linkis;
+
+       char **cur_title;
+       size_t title_id;
+
+       size_t *dist_table;
+       size_t cur_dist;
+       int cur_dist_is_not_last;
+
+       size_t i;
+
+
+       if (argc < 2) {
+               printf("Supply article name to look up articles linking to it.\n");
+               return 1;
+       }
+
+
+       /*
+        * Read all incoming links into memory
+        */
+
+       in_file = fopen("links-incoming.bin", "rb");
+
+       fread(&titles, sizeof(titles), 1, in_file);
+
+       linki = malloc(titles * sizeof(size_t*));
+       linkis = malloc(titles * sizeof(size_t));
+
+       for (i = 0; i < titles; i++) {
+               size_t j;
+
+               fread(&linkis[i], sizeof(linkis[i]), 1, in_file);
+               //printf("linkis[%zd] = %zd\n", i, linkis[i]);
+
+               linki[i] = malloc(linkis[i] * sizeof(linki[i][0]));
+
+               j = fread(linki[i], sizeof(linki[i][0]), linkis[i], in_file);
+               assert(j == linkis[i]);
+               //for (j = 0; j < linkis[i]; j++) {
+               //      fread(&linki[i][j], sizeof(linki[i][j]), 1, in_file);
+               //}
+       }
+       printf("Incoming links read (%zd bytes).\n", ftell(in_file));
+       fclose(in_file);
+
+
+
+
+       /*
+        * Read all titles into memory
+        */
+
+       title = malloc(titles * sizeof(title[0]));
+
+       in_file = fopen("titles-sorted.txt", "r");
+       while (!feof(in_file)) {
+               char *in_line = NULL;
+               ssize_t in_line_len = 0;
+               size_t zero = 0;
+
+               in_line_len = getline(&in_line, &zero, in_file);
+
+               /* Ignore empty lines and errors */
+               if (in_line_len < 2) {
+                       continue;
+               }
+
+               /* Delete trailing newline */
+               in_line[in_line_len - 1] = '\0';
+
+               title[titles_read] = in_line;
+               titles_read++;
+       }
+       fclose(in_file);
+
+       printf("Titles read.\n");
+
+
+
+
+       /* Look up article ID */
+       cur_title = bsearch(argv[1], title,
+                               titles, sizeof(title[0]),
+                               cmpstring_p_pp);
+
+       if (!cur_title) {
+               printf("TITLE NOT FOUND: %s\n", argv[1]);
+               return 1;
+       }
+
+       title_id = cur_title - title;
+
+       printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]);
+
+       for (i = 0; i < linkis[title_id]; i++) {
+               size_t x = linki[title_id][i];
+               printf("  %s\n", title[x]);
+       }
+
+
+
+       printf("\n\n\n\nBuilding table of distances...\n\n");
+
+       dist_table = malloc(titles * sizeof(dist_table[0]));
+
+       dist_table[title_id] = 0xdeadbeef;
+
+       for (i = 0; i < linkis[title_id]; i++) {
+               size_t x = linki[title_id][i];
+               dist_table[x] = 1;
+       }
+
+       cur_dist = 1;
+       cur_dist_is_not_last = 1;
+
+       while (cur_dist_is_not_last) {
+               size_t articles_found = 0;
+
+               cur_dist_is_not_last = 0;
+
+               for (i = 0; i < titles; i++) {
+                       if (dist_table[i] == cur_dist) {
+                               size_t j;
+
+                               for (j = 0; j < linkis[i]; j++) {
+                                       size_t x = linki[i][j];
+
+                                       if (!dist_table[x]) {
+                                               dist_table[x] = cur_dist + 1;
+                                       }
+                               }
+
+                               cur_dist_is_not_last = 1;
+
+                               articles_found++;
+                       }
+               }
+
+               printf("Distance %zd: Found %zd articles.\n", cur_dist, articles_found);
+               cur_dist++;
+       }
+
+       cur_dist -= 2;
+
+       printf("\n\n\nThe articles with the distance of %zd are:\n\n", cur_dist);
+       for (i = 0; i < titles; i++) {
+               if (dist_table[i] == cur_dist) {
+                       printf("  %s\n", title[i]);
+               }
+       }
+
+
+       return 0;
+}