summary | refs | log | tree | commit | diff
path: root/src/postprocess/sort_plz.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/postprocess/sort_plz.c')
-rw-r--r-- src/postprocess/sort_plz.c 121
1 files changed, 76 insertions, 45 deletions
diff --git a/src/postprocess/sort_plz.c b/src/postprocess/sort_plz.c
index f44cec7..26ac9d0 100644
--- a/src/postprocess/sort_plz.c
+++ b/src/postprocess/sort_plz.c
@@ -4,7 +4,7 @@
4 <PLZ> is either [0-9_]{5} or _____ or brken 4 <PLZ> is either [0-9_]{5} or _____ or brken
5 opens files in source directory: 5 opens files in source directory:
6 01_Flags 02_Nachname 03_Vorname 04_Zusaetze 07_Strasse 08_Hausnummer 09_Verweise 10_Postleitzahl 6 01_Flags 02_Nachname 03_Vorname 04_Zusaetze 07_Strasse 08_Hausnummer 09_Verweise 10_Postleitzahl
7 11_Ort 12_Vorwahl 13_Rufnummer 16_Koordinaten 7 11_Ort 12_Vorwahl 13_Rufnummer 14_15_Email_Webadresse 16_Koordinaten
8 appends to all of the above dirs plus 8 appends to all of the above dirs plus
9 00_Jahr 9 00_Jahr
10*/ 10*/
@@ -14,12 +14,21 @@
14#include <stdio.h> 14#include <stdio.h>
15#include <string.h> 15#include <string.h>
16#include <errno.h> 16#include <errno.h>
17#include <unistd.h>
17#include <err.h> 18#include <err.h>
18 19
19enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_16, F_COUNT }; 20enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_16, F_COUNT };
20 21
21static char *g_filenames[] = { 22static char *g_filenames[] = {
22 "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "16_Koordinaten" }; 23 "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_15_Email_Webadresse", "16_Koordinaten" };
24
/* One cached output stream, keyed by the file name it was opened under. */
typedef struct {
  char   plz[8];  /* lookup key; only the first 5 bytes are compared and filled */
  FILE * file;    /* stream for that key, opened in append mode */
} outhandle;

/* Upper bound on the number of output files that can be open at once */
enum { OUTHANDLE_CAPACITY = 32*1024 };

/* Table of open output streams, kept sorted ascending by plz, and the
   number of slots currently in use */
static outhandle g_outhandles[OUTHANDLE_CAPACITY];
static int g_outhandle_count;
23 32
24FILE * fopen_prefix(char *prefix, int file_id, int readonly) { 33FILE * fopen_prefix(char *prefix, int file_id, int readonly) {
25 char filename[1024]; 34 char filename[1024];
@@ -27,10 +36,50 @@ FILE * fopen_prefix(char *prefix, int file_id, int readonly) {
27 return fopen(filename, readonly ? "r" : "a"); 36 return fopen(filename, readonly ? "r" : "a");
28} 37}
29 38
/* Binary search over a sorted array of fixed-size records that returns a
   position even if no exact match is found, giving calling functions the
   chance to insert data there.

   key           bytes to look for; compare_size bytes are read from it
   base          first record of the array; records must be sorted ascending
                 in memcmp order of their first compare_size bytes
   member_count  number of records in the array
   member_size   size of one record in bytes
   compare_size  number of leading bytes of each record compared against key
   exactmatch    out: set to 1 if a matching record was found, 0 otherwise

   Returns a pointer to the matching record or, if there is none, to the
   position where a record with this key has to be inserted to keep the
   array sorted (this may be one past the last record).
*/
void *binary_search( const void * const key, const void * base, const size_t member_count, const size_t member_size,
                     size_t compare_size, int *exactmatch ) {
  const unsigned char *lower = base;
  size_t remaining = member_count;

  *exactmatch = 0;
  while( remaining ) {
    const unsigned char *probe = lower + member_size * ( remaining / 2 );
    int cmp = memcmp( probe, key, compare_size );

    if( cmp == 0 ) {
      *exactmatch = 1;
      return (void*)probe;
    }
    if( cmp < 0 ) {
      /* key sorts after probe: continue above it, probe itself is excluded */
      lower = probe + member_size;
      remaining--;
    }
    remaining /= 2;
  }

  return (void*)lower;
}
64
65FILE * get_file_for_postleitzahl(char *plz) {
66 int exactmatch = 0;
67 outhandle * oh = (outhandle *)binary_search(plz, g_outhandles, g_outhandle_count, sizeof(outhandle), 5, &exactmatch);
68 if (!exactmatch) {
69 size_t s = (g_outhandles + g_outhandle_count) - oh;
70 memmove(oh + 1, oh, s * sizeof(outhandle));
71 oh->file = fopen(plz, "a");
72 if (!oh->file) errx( 1, "Couldn't open file %s for writing\n", plz);
73 memcpy(oh->plz, plz, 5);
74 g_outhandle_count++;
75 }
76 return oh->file;
77}
78
30int main(int argc, char **args) { 79int main(int argc, char **args) {
31 FILE * in_handles[F_COUNT] = { NULL }; 80 FILE * in_handles[F_COUNT] = { NULL };
32 FILE * out_handles[F_COUNT] = { NULL }; 81 FILE * out_handle = NULL;
33 char flags[4]; 82 char flags[4], outfile[6];
34 int i, in_multi = 0; 83 int i, in_multi = 0;
35 char *input = malloc(1024); 84 char *input = malloc(1024);
36 size_t input_size = 1024; 85 size_t input_size = 1024;
@@ -42,33 +91,26 @@ int main(int argc, char **args) {
42 errx( 1, "Couldn't open file %s\n", g_filenames[i]); 91 errx( 1, "Couldn't open file %s\n", g_filenames[i]);
43 } 92 }
44 93
45 mkdir( "multi", 0755); 94 mkdir("output", 0755);
46 mkdir( "single", 0755); 95 chdir("output");
47 96
48 /* Get Flags to check if we're processing a continuation */ 97 /* Get Flags to check if we're processing a continuation */
49 while (fgets(flags, 4, in_handles[F_01])) { 98 while (fgets(flags, 4, in_handles[F_01])) {
50 char out_dir[32];
51 ssize_t linelen; 99 ssize_t linelen;
52 char flag = strtoul(flags, 0, 16); 100 char flag = strtoul(flags, 0, 16);
53 char *type = flag & 1 ? "multi/" : "single/";
54 101
55 /* If we're in multiline mode, we just copy lines as long as we see continuations */ 102 /* If we're in multiline mode, we just copy lines as long as we see continuations */
56 if (in_multi) { 103 if (in_multi) {
57 if (flag & 0x2) { 104 if (flag & 0x2) {
58 fputs(args[1], out_handles[F_00]); // write Jahr 105 fputs(args[1], out_handle); // write Jahr
59 fputc(10, out_handles[F_00]); 106 fputc(10, out_handle);
60 fwrite(flags, 3, 1, out_handles[F_01]); // copy Flags verbatim 107 fwrite(flags, 3, 1, out_handle); // copy Flags verbatim
61 for (i=F_10; i<F_COUNT; ++i) { // process the rest of entries 108 for (i=F_10; i<F_COUNT; ++i) { // process the rest of entries
62 ssize_t linelen = getline(&input, &input_size, in_handles[i]); 109 ssize_t linelen = getline(&input, &input_size, in_handles[i]);
63 fwrite(input, linelen, 1, out_handles[i]); 110 fwrite(input, linelen, 1, out_handle);
64 } 111 }
65 continue; 112 continue;
66 } 113 }
67 /* If the entry is not a continuation, close all output files and switch off multi mode */
68 for (i=0; i<F_COUNT; ++i) {
69 fclose(out_handles[i]);
70 out_handles[i] = NULL;
71 }
72 in_multi = 0; 114 in_multi = 0;
73 } 115 }
74 116
@@ -83,50 +125,39 @@ int main(int argc, char **args) {
83 } 125 }
84 126
85 if (linelen == 0) // empty PLZ 127 if (linelen == 0) // empty PLZ
86 strcpy(out_dir, in_multi ? "multi/_____" : "single/_____"); 128 strcpy(outfile, "_____");
87 else if (linelen == 5) { // potentially normal 129 else if (linelen == 5) { // potentially normal
88 int broken = 0; 130 int broken = 0;
89 char * dest = out_dir + sprintf(out_dir, in_multi ? "multi/" : "single/");
90 for (i=0; i<5; ++i) { 131 for (i=0; i<5; ++i) {
91 if ( (input[i] < '0' || input[i] > '9') && input[i] != '.') { 132 if ( (input[i] < '0' || input[i] > '9') && input[i] != '.') {
92 broken = 1; 133 broken = 1;
93 break; 134 break;
94 } 135 }
95 dest[i] = input[i]; 136 outfile[i] = input[i];
96 if (dest[i] == '.') dest[i] = '_'; 137 if (outfile[i] == '.') outfile[i] = '_';
97 } 138 }
98 dest[5] = 0; 139 outfile[5] = 0;
99 if (broken) 140 if (broken)
100 strcpy(out_dir, in_multi ? "multi/broken" : "single/broken"); 141 strcpy(outfile, "brken");
101 } else 142 } else
102 strcpy(out_dir, in_multi ? "multi/broken" : "single/broken"); 143 strcpy(outfile, "brken");
103 144
104 if (mkdir(out_dir, 0755) == -1 && errno != EEXIST) 145 out_handle = get_file_for_postleitzahl(outfile);
105 errx( 1, "Couldn't create directory %s %d\n", out_dir, errno);
106 146
107 for (i=F_00; i<F_COUNT; ++i) { 147 fputs(args[1], out_handle); // write Jahr
108 out_handles[i] = fopen_prefix(out_dir, i, 0); 148 fputc(10, out_handle);
109 if (!out_handles[i]) 149 fwrite(flags, 3, 1, out_handle); // copy Flags verbatim
110 errx( 1, "Couldn't open file %s\n", g_filenames[i]); 150 fputs(input, out_handle); // copy Postleitzahl verbatim
111 } 151 fputc(10, out_handle);
112
113 fputs(args[1], out_handles[F_00]); // write Jahr
114 fputc(10, out_handles[F_00]);
115 fwrite(flags, 3, 1, out_handles[F_01]); // copy Flags verbatim
116 fputs(input, out_handles[F_10]); // copy Postleitzahl verbatim
117 fputc(10, out_handles[F_10]);
118 152
119 for (i=F_02; i<F_COUNT; ++i) { // process the rest of entries 153 for (i=F_02; i<F_COUNT; ++i) { // process the rest of entries
120 ssize_t linelen = getline(&input, &input_size, in_handles[i]); 154 ssize_t linelen = getline(&input, &input_size, in_handles[i]);
121 fwrite(input, linelen, 1, out_handles[i]); 155 fwrite(input, linelen, 1, out_handle);
122 } 156 }
123
124 if (!in_multi)
125 for (i=0; i<F_COUNT; ++i) {
126 fclose(out_handles[i]);
127 out_handles[i] = NULL;
128 }
129 } 157 }
130 158
159 for (i=0; i<g_outhandle_count; ++i)
160 fclose(g_outhandles[i].file);
161
131 return 0; 162 return 0;
132} 163}