From 602c32207ac0085489dcdd436a6866d35455fc89 Mon Sep 17 00:00:00 2001 From: Dirk Engling Date: Sat, 23 Feb 2019 23:27:40 +0100 Subject: Add code for database import --- src/postprocess/merge_entries.c | 313 ++++++++++++++++++++++++++++++++++++++++ src/postprocess/postgres.txt | 41 ++++++ src/postprocess/postprocess.sh | 18 +++ src/postprocess/sort_plz.c | 4 +- 4 files changed, 374 insertions(+), 2 deletions(-) create mode 100644 src/postprocess/merge_entries.c create mode 100644 src/postprocess/postgres.txt create mode 100644 src/postprocess/postprocess.sh (limited to 'src/postprocess') diff --git a/src/postprocess/merge_entries.c b/src/postprocess/merge_entries.c new file mode 100644 index 0000000..3ebfa8c --- /dev/null +++ b/src/postprocess/merge_entries.c @@ -0,0 +1,313 @@ +#include "mystdlib.h" + +#include +#include +#include +#include +#include + +enum { COLUMNS = 15 }; +typedef struct { + char *ptr; + long rows; + long outoff; + long flag; +} entry_t; +typedef struct { + char *ptr; + size_t size; +} outvec_t; + +const char *g_year_map[] = { +"1992_Q2", "1995_Q0", "1996_Q0", "1996_Q1", "1997_Q1", "1997_Q3", "1998_Q1", "1998_Q3", "1999_Q1", "1999_Q3", "2000_Q1", "2000_Q3", "2001_Q1", "2001_Q2", "2001_Q3", "2001_Q4", "2002_Q1", +"2002_Q3", "2003_Q1", "2003_Q3", "2004_Q1", "2004_Q3", "2005_Q1", "2005_Q3", "2006_Q1", "2006_Q3", "2007_Q1", "2007_Q3", "2008_Q1", "2008_Q3", "2009_Q1", "2009_Q3", "2010_Q1", "2010_Q3", +"2011_Q1", "2011_Q3", "2012_Q1", "2012_Q3", "2013_Q1", "2013_Q3", "2014_Q1", "2014_Q3", "2015_Q1", "2015_Q3", "2016_Q1", "2016_Q3", "2017_Q1", "2017_Q3", "2018_Q1", "2018_Q3", "2019_Q1", +0 +}; + +void SKIP_1_COLUMN(char **ptr) { *ptr = strchr(*ptr, 10) + 1; } +void SKIP_2_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } +void SKIP_3_COLUMNS(char **ptr) { SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); SKIP_1_COLUMN(ptr); } + +int year_to_offset(const char *year) { + const char **y = g_year_map; + int off = 0; + while (*y) { + if (!memcmp(year, *y, 7)) return off; + ++off; ++y; + } + return -1; +} + + +int +STRCMP_n (const char *p1, const char *p2) +{ + const unsigned char *s1 = (const unsigned char *) p1; + const unsigned char *s2 = (const unsigned char *) p2; + unsigned char c1, c2; + do + { + c1 = (unsigned char) *s1++; + c2 = (unsigned char) *s2++; + if (c1 == 10) + return c1 - c2; + } + while (c1 == c2); + return c1 - c2; +} + +int compare_entries(entry_t*a, entry_t*b, int *prec) { + char *pa = a->ptr, *pb = b->ptr; + int col, row, res = 0, nprec = -1; + + /* Multi line entries never match single line entries */ + if (a->rows != b->rows) + return -1; + + /* Assume house number precision first .. unless */ + if (!memcmp(pa,"2006_Q3",7)) + *prec = 2; + else + *prec = 3; + + if (!memcmp(pb,"2006_Q3",7)) + nprec = 2; + else + nprec = 3; + + /* Skip year and flags */ + SKIP_2_COLUMNS(&pa); + SKIP_2_COLUMNS(&pb); + + /* Test all columns for identity */ + for (col=2; colrows; ++row) { + + /* Skip last row's coordinate columns, year and flags */ + SKIP_3_COLUMNS(&pa); + SKIP_3_COLUMNS(&pb); + + for (col=2; colptr; + char * pb = (char*)e_b->ptr; + + int results[COLUMNS], c; + + if (e_a->rows != e_b->rows) + return e_a->rows - e_b->rows; + + for (c = 0; csize; ++i) + if (tbuch->addr[i] == 10) + ++lines; + + sort_array = (entry_t*)malloc((lines / COLUMNS) * sizeof(entry_t)); + out_array = (outvec_t*)malloc((lines / COLUMNS) * sizeof(outvec_t)); + + ptr = (char*)tbuch->addr; + start = ptr; + + while (ptr < (char*)tbuch->addr + tbuch->size) { + int c; + + start = ptr; + + /* Look for field terminator */ + for (c=0; c= 0); + sort_array[current].rows++; + } else { + sort_array[++current].ptr = start; + sort_array[current].rows = 0; + sort_array[current].outoff = outoff; + sort_array[current].flag = flag; + } + out_array[outoff].size = ptr - out_array[outoff].ptr; + outoff++; + } + + /* Sort the whole thing */ + qsort(sort_array, current, sizeof(entry_t), sort_me); + + for (i=0; i<=current; ++i) { + int j, dump = 0, prec; + + int year = year_to_offset(sort_array[i].ptr); + + year_list |= 1LL << year; + if (sort_array[i].flag & 0x80 ) bizflag_list |= 1LL << year; + if (sort_array[i].flag & 0x40 ) revflag_list |= 1LL << year; + + /* The last entry always needs to be dumped, but check if its + precision is better than the old truth's + The second comparision checks for equality of entries (modulo + coordinate mismatch) + */ + if (i == current) { + compare_entries(sort_array+i, sort_array+i, &prec); + dump = 1; + } else if (compare_entries(sort_array+i, sort_array+i+1, &prec)) + dump = 1; + + /* If this entry's precision is higher than the one of possible + earlier matches, then the current entry becomes the truth */ + if (prec >= truth_prec) { + truth = i; + truth_prec = prec; + } + + if (dump) { + printf("%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t", year_list, bizflag_list, revflag_list); + for (int c=0; cptr, 10); + size_t len = s - out->ptr; + if (!len || out->ptr[0] == 9) + skipped++; + else { + if (!started++) + putchar('{'); + else + putchar(','); + for (int x=0; xptr, len); + else { + char coords[64], *tab; +// memcpy(coords, "POINT(", 6); +// memcpy(coords + 6, out->ptr, len); +// tab = memchr(coords + 6, 9, len); +// if (tab) *tab = ' '; +// coords[6+len] = ')'; +// fwrite(coords, 7 + len, 1, stdout); + memcpy(coords, out->ptr, len); + tab = memchr(coords, 9, len); + if (tab) *tab = ' '; + fwrite(coords, len, 1, stdout); + } + skipped = 0; + } + out->ptr = s + 1; + ++out; + } + if (started) putchar('}'); + if (c&2; ../merge_entries $a | iconv -f iso8859-15 -t utf-8 > ../sorted/$a; done + +cd ../output + diff --git a/src/postprocess/sort_plz.c b/src/postprocess/sort_plz.c index 26ac9d0..4c30ea3 100644 --- a/src/postprocess/sort_plz.c +++ b/src/postprocess/sort_plz.c @@ -17,10 +17,10 @@ #include #include -enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_16, F_COUNT }; +enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_15, F_16, F_COUNT }; static char *g_filenames[] = { - "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_15_Email_Webadresse", "16_Koordinaten" }; + "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_Webadresse", "15_Email", "16_Koordinaten" }; typedef struct { char plz[8]; -- cgit v1.2.3