From: norly Date: Sun, 14 Jul 2019 12:21:58 +0000 (+0200) Subject: Move to __uint32_t from glibc's bits/types.h X-Git-Url: https://git.enpas.org/?p=enwiki-links-graph.git;a=commitdiff_plain;h=4023e389527f9102603c60dbecd1888d55e6b950 Move to __uint32_t from glibc's bits/types.h This saves 50% of HDD space for the links on 64-bit platforms --- diff --git a/links-outgoing-to-incoming.c b/links-outgoing-to-incoming.c index 8a23ca7..2fd853a 100644 --- a/links-outgoing-to-incoming.c +++ b/links-outgoing-to-incoming.c @@ -3,25 +3,28 @@ #include #include #include +#include +typedef __uint32_t art_id; + int main() { FILE *in_file; FILE *out_file; - size_t titles; + art_id titles; - size_t **linko; - size_t *linkos; + art_id **linko; + art_id *linkos; - size_t **linki; - size_t *linkis; + art_id **linki; + art_id *linkis; - size_t link_titles_done = 0; + art_id link_titles_done = 0; - size_t i; + art_id i; /* @@ -32,11 +35,11 @@ int main() fread(&titles, sizeof(titles), 1, in_file); - linko = malloc(titles * sizeof(size_t*)); - linkos = malloc(titles * sizeof(size_t)); + linko = malloc(titles * sizeof(art_id*)); + linkos = malloc(titles * sizeof(art_id)); for (i = 0; i < titles; i++) { - size_t j; + art_id j; fread(&linkos[i], sizeof(linkos[i]), 1, in_file); @@ -52,14 +55,14 @@ int main() - linki = malloc(titles * sizeof(size_t*)); - linkis = malloc(titles * sizeof(size_t)); + linki = malloc(titles * sizeof(art_id*)); + linkis = malloc(titles * sizeof(art_id)); for (i = 0; i < titles; i++) { - size_t j; + art_id j; for (j = 0; j < linkos[i]; j++) { - size_t x = linko[i][j]; + art_id x = linko[i][j]; linkis[x]++; linki[x] = realloc(linki[x], linkis[x] * sizeof(linki[x][0])); @@ -75,7 +78,7 @@ int main() out_file = fopen("links-incoming.bin", "wb"); fwrite(&titles, sizeof(titles), 1, out_file); for (i = 0; i < titles; i++) { - size_t j; + art_id j; fwrite(&linkis[i], sizeof(linkis[i]), 1, out_file); diff --git a/links-plain-to-binary.c b/links-plain-to-binary.c index 
13ded29..005a496 100644 --- a/links-plain-to-binary.c +++ b/links-plain-to-binary.c @@ -3,6 +3,10 @@ #include #include #include +#include + + +typedef __uint32_t art_id; static int @@ -26,15 +30,15 @@ int main() FILE *out_file; char **title = NULL; - size_t titles = 0; - size_t titles_alloc = 0; + art_id titles = 0; + art_id titles_alloc = 0; - size_t **link; - size_t *links; + art_id **link; + art_id *links; - size_t link_titles_done = 0; + art_id link_titles_done = 0; - size_t i; + art_id i; /* @@ -73,8 +77,8 @@ int main() - link = malloc(titles * sizeof(size_t*)); - links = malloc(titles * sizeof(size_t)); + link = malloc(titles * sizeof(art_id*)); + links = malloc(titles * sizeof(art_id)); in_file = fopen("enwiki-links-plain.txt", "r"); while (!feof(in_file)) { @@ -131,7 +135,7 @@ int main() } links[i]++; - link[i] = realloc(link[i], links[i] * sizeof(size_t)); + link[i] = realloc(link[i], links[i] * sizeof(art_id)); link[i][links[i] - 1] = cur_link - title; @@ -160,7 +164,7 @@ int main() out_file = fopen("links-outgoing.bin", "wb"); fwrite(&titles, sizeof(titles), 1, out_file); for (i = 0; i < titles; i++) { - size_t j; + art_id j; fwrite(&links[i], sizeof(links[i]), 1, out_file); diff --git a/lookup-incoming.c b/lookup-incoming.c index 50f2dbc..d1467f8 100644 --- a/lookup-incoming.c +++ b/lookup-incoming.c @@ -3,6 +3,10 @@ #include #include #include +#include + + +typedef __uint32_t art_id; static int @@ -20,20 +24,20 @@ int main(int argc, char **argv) FILE *out_file; char **title = NULL; - size_t titles; - size_t titles_read = 0; + art_id titles; + art_id titles_read = 0; - size_t **linki; - size_t *linkis; + art_id **linki; + art_id *linkis; char **cur_title; - size_t title_id; + art_id title_id; - size_t *dist_table; - size_t cur_dist; + art_id *dist_table; + art_id cur_dist; int cur_dist_is_not_last; - size_t i; + art_id i; if (argc < 2) { @@ -50,11 +54,11 @@ int main(int argc, char **argv) fread(&titles, sizeof(titles), 1, in_file); - linki = 
malloc(titles * sizeof(size_t*)); - linkis = malloc(titles * sizeof(size_t)); + linki = malloc(titles * sizeof(art_id*)); + linkis = malloc(titles * sizeof(art_id)); for (i = 0; i < titles; i++) { - size_t j; + art_id j; fread(&linkis[i], sizeof(linkis[i]), 1, in_file); //printf("linkis[%zd] = %zd\n", i, linkis[i]); @@ -120,7 +124,7 @@ int main(int argc, char **argv) printf("Article %zd (%s) is linked from %zd articles:\n", title_id, title[title_id], linkis[title_id]); for (i = 0; i < linkis[title_id]; i++) { - size_t x = linki[title_id][i]; + art_id x = linki[title_id][i]; printf(" %s\n", title[x]); } @@ -133,7 +137,7 @@ int main(int argc, char **argv) dist_table[title_id] = 0xdeadbeef; for (i = 0; i < linkis[title_id]; i++) { - size_t x = linki[title_id][i]; + art_id x = linki[title_id][i]; dist_table[x] = 1; } @@ -141,16 +145,16 @@ int main(int argc, char **argv) cur_dist_is_not_last = 1; while (cur_dist_is_not_last) { - size_t articles_found = 0; + art_id articles_found = 0; cur_dist_is_not_last = 0; for (i = 0; i < titles; i++) { if (dist_table[i] == cur_dist) { - size_t j; + art_id j; for (j = 0; j < linkis[i]; j++) { - size_t x = linki[i][j]; + art_id x = linki[i][j]; if (!dist_table[x]) { dist_table[x] = cur_dist + 1;