From 001acb04b78393acbd69d02a9a4d70f878d921d4 Mon Sep 17 00:00:00 2001 From: Dirk Engling Date: Mon, 4 Feb 2019 22:30:00 +0100 Subject: Make sort_plz work on single files instead of creating all the columns --- src/postprocess/sort_plz.c | 121 ++++++++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 45 deletions(-) diff --git a/src/postprocess/sort_plz.c b/src/postprocess/sort_plz.c index f44cec7..26ac9d0 100644 --- a/src/postprocess/sort_plz.c +++ b/src/postprocess/sort_plz.c @@ -4,7 +4,7 @@ is either [0-9_]{5} or _____ or brken opens files in source directory: 01_Flags 02_Nachname 03_Vorname 04_Zusaetze 07_Strasse 08_Hausnummer 09_Verweise 10_Postleitzahl - 11_Ort 12_Vorwahl 13_Rufnummer 16_Koordinaten + 11_Ort 12_Vorwahl 13_Rufnummer 14_15_Email_Webadresse 16_Koordinaten appends to all of the above dirs plus 00_Jahr */ @@ -14,12 +14,21 @@ #include #include #include +#include #include -enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_16, F_COUNT }; +enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_16, F_COUNT }; static char *g_filenames[] = { - "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "16_Koordinaten" }; + "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_15_Email_Webadresse", "16_Koordinaten" }; + +typedef struct { + char plz[8]; + FILE * file; +} outhandle; + +static outhandle g_outhandles[32*1024]; +static int g_outhandle_count; FILE * fopen_prefix(char *prefix, int file_id, int readonly) { char filename[1024]; @@ -27,10 +36,50 @@ FILE * fopen_prefix(char *prefix, int file_id, int readonly) { return fopen(filename, readonly ? "r" : "a"); } +/* This function gives us a binary search that returns a pointer, even if + no exact match is found. In that case it sets exactmatch 0 and gives + calling functions the chance to insert data +*/ +void *binary_search( const void * const key, const void * base, const size_t member_count, const size_t member_size, + size_t compare_size, int *exactmatch ) { + size_t interval = member_count; + + while( interval ) { + uint8_t *lookat = ((uint8_t*)base) + member_size * ( interval / 2 ); + int cmp = memcmp( lookat, key, compare_size ); + if(cmp == 0 ) { + base = lookat; + break; + } + if(cmp < 0) { + base = lookat + member_size; + interval--; + } + interval /= 2; + } + + *exactmatch = interval; + return (void*)base; +} + +FILE * get_file_for_postleitzahl(char *plz) { + int exactmatch = 0; + outhandle * oh = (outhandle *)binary_search(plz, g_outhandles, g_outhandle_count, sizeof(outhandle), 5, &exactmatch); + if (!exactmatch) { + size_t s = (g_outhandles + g_outhandle_count) - oh; + memmove(oh + 1, oh, s * sizeof(outhandle)); + oh->file = fopen(plz, "a"); + if (!oh->file) errx( 1, "Couldn't open file %s for writing\n", plz); + memcpy(oh->plz, plz, 5); + g_outhandle_count++; + } + return oh->file; +} + int main(int argc, char **args) { FILE * in_handles[F_COUNT] = { NULL }; - FILE * out_handles[F_COUNT] = { NULL }; - char flags[4]; + FILE * out_handle = NULL; + char flags[4], outfile[6]; int i, in_multi = 0; char *input = malloc(1024); size_t input_size = 1024; @@ -42,33 +91,26 @@ int main(int argc, char **args) { errx( 1, "Couldn't open file %s\n", g_filenames[i]); } - mkdir( "multi", 0755); - mkdir( "single", 0755); + mkdir("output", 0755); + chdir("output"); /* Get Flags to check if we're processing a continuation */ while (fgets(flags, 4, in_handles[F_01])) { - char out_dir[32]; ssize_t linelen; char flag = strtoul(flags, 0, 16); - char *type = flag & 1 ? "multi/" : "single/"; /* If we're in multiline mode, we just copy lines as long as we see continuations */ if (in_multi) { if (flag & 0x2) { - fputs(args[1], out_handles[F_00]); // write Jahr - fputc(10, out_handles[F_00]); - fwrite(flags, 3, 1, out_handles[F_01]); // copy Flags verbatim + fputs(args[1], out_handle); // write Jahr + fputc(10, out_handle); + fwrite(flags, 3, 1, out_handle); // copy Flags verbatim for (i=F_10; i '9') && input[i] != '.') { broken = 1; break; } - dest[i] = input[i]; - if (dest[i] == '.') dest[i] = '_'; + outfile[i] = input[i]; + if (outfile[i] == '.') outfile[i] = '_'; } - dest[5] = 0; + outfile[5] = 0; if (broken) - strcpy(out_dir, in_multi ? "multi/broken" : "single/broken"); + strcpy(outfile, "brken"); } else - strcpy(out_dir, in_multi ? "multi/broken" : "single/broken"); + strcpy(outfile, "brken"); - if (mkdir(out_dir, 0755) == -1 && errno != EEXIST) - errx( 1, "Couldn't create directory %s %d\n", out_dir, errno); + out_handle = get_file_for_postleitzahl(outfile); - for (i=F_00; i