/*
 * sort_plz -- distribute column-wise phone book records into one directory
 * per postcode (PLZ).
 *
 * Extracted from the patch "First attempt to sort for unifications", which
 * adds this file as src/postprocess/sort_plz.c.
 *
 * Usage: sort_plz <source-dir>
 *
 * Input: the following files in <source-dir> are read in lockstep, one line
 * per record from each file:
 *   01_Flags 02_Nachname 03_Vorname 04_Zusaetze 07_Strasse 08_Hausnummer
 *   09_Verweise 10_Postleitzahl 11_Ort 12_Vorwahl 13_Rufnummer 16_Koordinaten
 *
 * Output: every record is appended to files of the same names, plus 00_Jahr,
 * inside a target directory below the current working directory:
 *   single/<PLZ>/   or   multi/<PLZ>/
 * <PLZ> is
 *   - the five input characters if they are all digits or '.', with every
 *     '.' replaced by '_',
 *   - "_____"  if the postcode line is empty,
 *   - "broken" for anything else.
 * 00_Jahr receives the <source-dir> argument verbatim, one line per record.
 *
 * 01_Flags holds one hexadecimal number per line. Bit 0x1 marks a record that
 * opens a multi-line entry (it goes to multi/); while such an entry is open,
 * every following record with bit 0x2 set is a continuation and is appended
 * to the same target directory without looking at its own postcode.
 */

#define _POSIX_C_SOURCE 200809L /* getline() and ssize_t under strict -std=cNN */

#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <err.h>

/*
 * Column ids. The order is deliberate: F_10 directly follows F_01 so that
 * "everything from the postcode on" is the range [F_10, F_COUNT) and
 * "everything after the postcode" is [F_02, F_COUNT).
 */
enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_16, F_COUNT };

/* Bits of the value parsed from a 01_Flags line, and the postcode length. */
enum { FLAG_MULTI = 0x1, FLAG_CONTINUATION = 0x2, PLZ_LEN = 5 };

/* File name of each column, indexed by the column ids above. */
static const char *const g_filenames[F_COUNT] = {
    "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname",
    "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort",
    "12_Vorwahl", "13_Rufnummer", "16_Koordinaten"
};

/*
 * Open the column file `file_id` inside directory `prefix`.
 *
 * readonly != 0 opens for reading ("r"), readonly == 0 opens for appending
 * ("a"). Returns NULL with errno set on failure, including the case where
 * the combined path does not fit the internal buffer (ENAMETOOLONG).
 */
FILE *fopen_prefix(const char *prefix, int file_id, int readonly)
{
    char filename[1024];
    int n = snprintf(filename, sizeof filename, "%s/%s", prefix, g_filenames[file_id]);

    if (n < 0 || (size_t)n >= sizeof filename) {
        errno = ENAMETOOLONG;
        return NULL;
    }
    return fopen(filename, readonly ? "r" : "a");
}

/* Create `path` as a directory; an already existing entry is not an error. */
static void make_dir(const char *path)
{
    if (mkdir(path, 0755) == -1 && errno != EEXIST)
        err(1, "Couldn't create directory %s", path);
}

/*
 * Read the next line of input column `file_id` into *line (getline semantics,
 * the buffer grows as needed). Returns the line length including the trailing
 * newline if present, or -1 at end of file. Any other failure is fatal.
 */
static ssize_t read_line(FILE *const *in, int file_id, char **line, size_t *cap)
{
    ssize_t len = getline(line, cap, in[file_id]);

    if (len == -1 && !feof(in[file_id]))
        err(1, "Couldn't read file %s", g_filenames[file_id]);
    return len;
}

/*
 * Write `len` bytes of `data` to output column `file_id` and make sure the
 * record ends in a newline. The output files are appended to, so an
 * unterminated last input line must not run into the next record.
 */
static void write_line(FILE *out, int file_id, const char *data, size_t len)
{
    if (len && fwrite(data, 1, len, out) != len)
        err(1, "Couldn't write to file %s", g_filenames[file_id]);
    if ((len == 0 || data[len - 1] != '\n') && fputc('\n', out) == EOF)
        err(1, "Couldn't write to file %s", g_filenames[file_id]);
}

/*
 * Copy one line from every input column in [first, F_COUNT) to the matching
 * output column. A column that runs out before 01_Flags does is fatal: the
 * columns would no longer describe the same record.
 */
static void copy_columns(int first, FILE *const *in, FILE *const *out,
                         char **line, size_t *cap)
{
    for (int i = first; i < F_COUNT; ++i) {
        ssize_t len = read_line(in, i, line, cap);

        if (len == -1)
            errx(1, "File %s has fewer lines than %s", g_filenames[i], g_filenames[F_01]);
        write_line(out[i], i, *line, (size_t)len);
    }
}

/* Write the year and the verbatim flags line of the current record. */
static void write_record_head(FILE *const *out, const char *year,
                              const char *flags, size_t flags_len)
{
    if (fputs(year, out[F_00]) == EOF || fputc('\n', out[F_00]) == EOF)
        err(1, "Couldn't write to file %s", g_filenames[F_00]);
    write_line(out[F_01], F_01, flags, flags_len);
}

/* Open all output columns inside `out_dir` for appending. */
static void open_outputs(const char *out_dir, FILE **out)
{
    for (int i = F_00; i < F_COUNT; ++i) {
        out[i] = fopen_prefix(out_dir, i, 0);
        if (!out[i])
            err(1, "Couldn't open file %s/%s", out_dir, g_filenames[i]);
    }
}

/*
 * Close every open output column and reset its slot to NULL. fclose() flushes,
 * so this is where a failed write to an append stream becomes visible.
 */
static void close_outputs(FILE **out)
{
    for (int i = 0; i < F_COUNT; ++i) {
        if (out[i] && fclose(out[i]) != 0)
            err(1, "Couldn't close file %s", g_filenames[i]);
        out[i] = NULL;
    }
}

/*
 * Build the target directory name for a postcode.
 *
 * plz/len is the postcode line without its newline, `multi` selects the
 * "multi" or "single" tree. The result "<tree>/<PLZ>" is stored in out_dir.
 */
static void plz_to_dir(const char *plz, size_t len, int multi,
                       char *out_dir, size_t out_size)
{
    char bucket[PLZ_LEN + 1];
    const char *name = "broken";

    if (len == 0) {
        name = "_____";
    } else if (len == PLZ_LEN) {
        size_t i;

        for (i = 0; i < PLZ_LEN; ++i) {
            char c = plz[i];

            if (c == '.')
                c = '_';
            else if (c < '0' || c > '9')
                break; /* anything but digits and '.' makes it "broken" */
            bucket[i] = c;
        }
        if (i == PLZ_LEN) {
            bucket[PLZ_LEN] = 0;
            name = bucket;
        }
    }
    snprintf(out_dir, out_size, "%s/%s", multi ? "multi" : "single", name);
}

/*
 * Entry point. args[1] is the source directory; it is also the text written
 * to 00_Jahr for every record. Exits with status 1 and a message on stderr on
 * any error, returns 0 once 01_Flags is exhausted.
 */
int main(int argc, char **args)
{
    FILE *in_handles[F_COUNT] = { NULL };
    FILE *out_handles[F_COUNT] = { NULL };
    char *flags = NULL, *input = NULL;
    size_t flags_size = 0, input_size = 0;
    ssize_t flags_len;
    const char *year;
    int in_multi = 0;

    if (argc < 2)
        errx(1, "Usage: %s <source-dir>", argc > 0 ? args[0] : "sort_plz");
    year = args[1];

    /* First open all input files; there is no input for 00_Jahr. */
    for (int i = F_01; i < F_COUNT; ++i) {
        in_handles[i] = fopen_prefix(args[1], i, 1);
        if (!in_handles[i])
            err(1, "Couldn't open file %s/%s", args[1], g_filenames[i]);
    }

    make_dir("multi");
    make_dir("single");

    /* One iteration per record; 01_Flags drives the loop. */
    while ((flags_len = read_line(in_handles, F_01, &flags, &flags_size)) != -1) {
        unsigned long flag = strtoul(flags, NULL, 16);
        char out_dir[32];
        ssize_t plz_len;

        /* Inside a multi entry, continuations go to the already open files. */
        if (in_multi) {
            if (flag & FLAG_CONTINUATION) {
                write_record_head(out_handles, year, flags, (size_t)flags_len);
                copy_columns(F_10, in_handles, out_handles, &input, &input_size);
                continue;
            }
            /* Not a continuation: the multi entry is complete. */
            close_outputs(out_handles);
            in_multi = 0;
        }

        if (flag & FLAG_MULTI)
            in_multi = 1;

        /* Read Postleitzahl to get the destination. */
        plz_len = read_line(in_handles, F_10, &input, &input_size);
        if (plz_len == -1)
            errx(1, "File %s has fewer lines than %s", g_filenames[F_10], g_filenames[F_01]);
        if (plz_len > 0 && input[plz_len - 1] == '\n')
            input[--plz_len] = 0; /* chomp */

        plz_to_dir(input, (size_t)plz_len, in_multi, out_dir, sizeof out_dir);
        make_dir(out_dir);
        open_outputs(out_dir, out_handles);

        write_record_head(out_handles, year, flags, (size_t)flags_len);
        write_line(out_handles[F_10], F_10, input, (size_t)plz_len);
        copy_columns(F_02, in_handles, out_handles, &input, &input_size);

        /* A multi entry keeps its files open for the continuations. */
        if (!in_multi)
            close_outputs(out_handles);
    }

    /* Input may end inside a multi entry; flush what is still open. */
    close_outputs(out_handles);

    for (int i = F_01; i < F_COUNT; ++i)
        fclose(in_handles[i]);
    free(flags);
    free(input);

    return 0;
}