From a73a9f7ebe6c82a9210e63700481b0b2dfcb0b4b Mon Sep 17 00:00:00 2001
From: Dirk Engling <erdgeist@erdgeist.org>
Date: Sat, 2 Feb 2019 22:56:02 +0100
Subject: First attempt to sort for unifications

---
 src/postprocess/sort_plz.c | 132 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 src/postprocess/sort_plz.c

(limited to 'src/postprocess')

diff --git a/src/postprocess/sort_plz.c b/src/postprocess/sort_plz.c
new file mode 100644
index 0000000..f44cec7
--- /dev/null
+++ b/src/postprocess/sort_plz.c
@@ -0,0 +1,132 @@
+/*
+  target is in current directory:
+    single/<PLZ> multi/<PLZ>
+    <PLZ> is either [0-9_]{5} or _____ or broken
+  opens files in source directory (given as first argument):
+    01_Flags 02_Nachname 03_Vorname 04_Zusaetze 07_Strasse 08_Hausnummer 09_Verweise 10_Postleitzahl
+    11_Ort 12_Vorwahl 13_Rufnummer 16_Koordinaten
+  appends to files of the same names in the target directory, plus
+    00_Jahr
+*/
+
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <err.h>
+
+enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_16, F_COUNT }; /* indices into g_filenames */
+/* File name per index; F_10 sits before F_02 so that loops starting at F_02 skip the Postleitzahl. */
+static char *g_filenames[] = {
+ "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "16_Koordinaten" };
+
+FILE * fopen_prefix(char *prefix, int file_id, int readonly) { /* opens <prefix>/<g_filenames[file_id]>, "r" if readonly, else "a" */
+  char filename[1024]; /* a path that does not fit is reported as failure (NULL), just like an fopen error */
+  int len = snprintf( filename, sizeof(filename), "%s/%s", prefix, g_filenames[file_id]);
+  return (len < 0 || len >= (int)sizeof(filename)) ? NULL : fopen(filename, readonly ? "r" : "a");
+}
+
+/* Sorts the records of source directory args[1] (whose name is also written as the 00_Jahr value)
+   into single/<PLZ>/ or multi/<PLZ>/ in the current directory; exits via errx() on any failure. */
+int main(int argc, char **args) {
+  FILE * in_handles[F_COUNT] = { NULL };
+  FILE * out_handles[F_COUNT] = { NULL };
+  char flags[4];
+  int i, in_multi = 0;
+  char *input = NULL;      /* line buffer, allocated and grown by getline() */
+  size_t input_size = 0;
+
+  if (argc < 2)            /* without this check args[1] is NULL everywhere below */
+    errx( 1, "Missing argument: <source directory>");
+
+  /* First open all input files (00_Jahr has no input, its value is args[1]) */
+  for (i=F_01; i<F_COUNT; ++i) {
+    in_handles[i] = fopen_prefix(args[1], i, 1);
+    if (!in_handles[i])
+      errx( 1, "Couldn't open file %s\n", g_filenames[i]);
+  }
+
+  mkdir( "multi", 0755);   /* unchecked: a failure surfaces at the per-PLZ mkdir below */
+  mkdir( "single", 0755);
+
+  /* Get Flags to check if we're processing a continuation */
+  while (fgets(flags, sizeof(flags), in_handles[F_01])) {
+    char out_dir[32];
+    char flag = strtoul(flags, 0, 16);
+    int continuation = in_multi && (flag & 0x2); /* 0x2 extends the multi entry that is still open */
+
+    /* If the entry is not a continuation, close all output files and switch off multi mode */
+    if (in_multi && !continuation) {
+      for (i=0; i<F_COUNT; ++i) {
+        fclose(out_handles[i]);
+        out_handles[i] = NULL;
+      }
+      in_multi = 0;
+    }
+
+    if (!continuation) { /* new entry: choose and open its destination directory */
+      if (flag & 0x1) in_multi = 1; /* 0x1 starts a multi entry */
+      /* Read Postleitzahl to get destination */
+      ssize_t linelen = getline(&input, &input_size, in_handles[F_10]);
+      if (linelen < 0) /* -1 must not be used as an index into input */
+        errx( 1, "Unexpected end of file %s", g_filenames[F_10]);
+      if (linelen && input[linelen - 1] == 10) { // chomp
+        input[linelen - 1] = 0;
+        --linelen;
+      }
+
+      if (linelen == 0) // empty PLZ
+        strcpy(out_dir, in_multi ? "multi/_____" : "single/_____");
+      else if (linelen == 5) { // potentially normal
+        int broken = 0;
+        char * dest = out_dir + sprintf(out_dir, in_multi ? "multi/" : "single/");
+        for (i=0; i<5; ++i) {
+          if ( (input[i] < '0' || input[i] > '9') && input[i] != '.') {
+            broken = 1;
+            break;
+          }
+          dest[i] = input[i];
+          if (dest[i] == '.') dest[i] = '_';
+        }
+        dest[5] = 0;
+        if (broken)
+          strcpy(out_dir, in_multi ? "multi/broken" : "single/broken");
+      } else
+        strcpy(out_dir, in_multi ? "multi/broken" : "single/broken");
+
+      if (mkdir(out_dir, 0755) == -1 && errno != EEXIST)
+        errx( 1, "Couldn't create directory %s %d\n", out_dir, errno);
+
+      for (i=F_00; i<F_COUNT; ++i) {
+        out_handles[i] = fopen_prefix(out_dir, i, 0);
+        if (!out_handles[i])
+          errx( 1, "Couldn't open file %s\n", g_filenames[i]);
+      }
+    }
+
+    fputs(args[1], out_handles[F_00]);            // write Jahr
+    fputc(10, out_handles[F_00]);
+    fputs(flags, out_handles[F_01]);              // copy Flags verbatim (only what fgets read)
+    if (!continuation) {
+      fputs(input, out_handles[F_10]);            // copy Postleitzahl, re-adding the chomped newline
+      fputc(10, out_handles[F_10]);
+    }
+
+    /* Copy the rest of the entries; a continuation's Postleitzahl line was not consumed above */
+    for (i = continuation ? F_10 : F_02; i<F_COUNT; ++i) {
+      ssize_t linelen = getline(&input, &input_size, in_handles[i]);
+      if (linelen < 0) /* column file shorter than 01_Flags; -1 must not reach fwrite */
+        errx( 1, "Unexpected end of file %s", g_filenames[i]);
+      fwrite(input, linelen, 1, out_handles[i]);
+    }
+
+    if (!in_multi)
+      for (i=0; i<F_COUNT; ++i) {
+        fclose(out_handles[i]);
+        out_handles[i] = NULL;
+      }
+  }
+
+  return 0; /* returning from main flushes and closes a multi entry that is still open */
+}
-- 
cgit v1.2.3