diff options
author | Dirk Engling <erdgeist@erdgeist.org> | 2019-02-04 22:30:00 +0100 |
---|---|---|
committer | Dirk Engling <erdgeist@erdgeist.org> | 2019-02-04 22:30:00 +0100 |
commit | 001acb04b78393acbd69d02a9a4d70f878d921d4 (patch) | |
tree | 0ddf40e6197f1e9656c18855491a85ff02e2e314 /src/postprocess | |
parent | 2d1db4b17f5afa5dcf1f5d586a64bc3b72876084 (diff) |
Make sort_plz work on single files instead of creating all the columns
Diffstat (limited to 'src/postprocess')
-rw-r--r-- | src/postprocess/sort_plz.c | 121 |
1 files changed, 76 insertions, 45 deletions
diff --git a/src/postprocess/sort_plz.c b/src/postprocess/sort_plz.c index f44cec7..26ac9d0 100644 --- a/src/postprocess/sort_plz.c +++ b/src/postprocess/sort_plz.c | |||
@@ -4,7 +4,7 @@ | |||
4 | <PLZ> is either [0-9_]{5} or _____ or brken | 4 | <PLZ> is either [0-9_]{5} or _____ or brken |
5 | opens files in source directory: | 5 | opens files in source directory: |
6 | 01_Flags 02_Nachname 03_Vorname 04_Zusaetze 07_Strasse 08_Hausnummer 09_Verweise 10_Postleitzahl | 6 | 01_Flags 02_Nachname 03_Vorname 04_Zusaetze 07_Strasse 08_Hausnummer 09_Verweise 10_Postleitzahl |
7 | 11_Ort 12_Vorwahl 13_Rufnummer 16_Koordinaten | 7 | 11_Ort 12_Vorwahl 13_Rufnummer 14_15_Email_Webadresse 16_Koordinaten |
8 | appends to all of the above dirs plus | 8 | appends to all of the above dirs plus |
9 | 00_Jahr | 9 | 00_Jahr |
10 | */ | 10 | */ |
@@ -14,12 +14,21 @@ | |||
14 | #include <stdio.h> | 14 | #include <stdio.h> |
15 | #include <string.h> | 15 | #include <string.h> |
16 | #include <errno.h> | 16 | #include <errno.h> |
17 | #include <unistd.h> | ||
17 | #include <err.h> | 18 | #include <err.h> |
18 | 19 | ||
19 | enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_16, F_COUNT }; | 20 | enum { F_00, F_01, F_10, F_02, F_03, F_04, F_07, F_08, F_09, F_11, F_12, F_13, F_14, F_16, F_COUNT }; |
20 | 21 | ||
21 | static char *g_filenames[] = { | 22 | static char *g_filenames[] = { |
22 | "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "16_Koordinaten" }; | 23 | "00_Jahr", "01_Flags", "10_Postleitzahl", "02_Nachname", "03_Vorname", "04_Zusaetze", "07_Strasse", "08_Hausnummer", "09_Verweise", "11_Ort", "12_Vorwahl", "13_Rufnummer", "14_15_Email_Webadresse", "16_Koordinaten" }; |
24 | |||
25 | typedef struct { | ||
26 | char plz[8]; | ||
27 | FILE * file; | ||
28 | } outhandle; | ||
29 | |||
30 | static outhandle g_outhandles[32*1024]; | ||
31 | static int g_outhandle_count; | ||
23 | 32 | ||
24 | FILE * fopen_prefix(char *prefix, int file_id, int readonly) { | 33 | FILE * fopen_prefix(char *prefix, int file_id, int readonly) { |
25 | char filename[1024]; | 34 | char filename[1024]; |
@@ -27,10 +36,50 @@ FILE * fopen_prefix(char *prefix, int file_id, int readonly) { | |||
27 | return fopen(filename, readonly ? "r" : "a"); | 36 | return fopen(filename, readonly ? "r" : "a"); |
28 | } | 37 | } |
29 | 38 | ||
/* Binary search over a sorted array of fixed-size records that yields a
   usable position even when the key is absent.

   key           bytes to look for; the first compare_size bytes are compared
   base          first record of the array; records must be sorted ascending
                 (memcmp order) by their first compare_size bytes
   member_count  number of records in the array
   member_size   size of one record in bytes
   compare_size  number of leading bytes of each record compared against key
   exactmatch    out: set to 1 if a record with an equal prefix was found,
                 0 otherwise

   Returns a pointer to the matching record. Without a match it returns the
   position where the key would have to be inserted to keep the array sorted;
   that may be one past the last record, so only dereference the result after
   checking *exactmatch or after making room for a new record there.
*/
void *binary_search( const void * const key, const void * base, const size_t member_count, const size_t member_size,
                     size_t compare_size, int *exactmatch ) {
  const unsigned char *lower = base;  /* start of the range still under consideration */
  size_t interval = member_count;     /* number of records in that range */
  int found = 0;

  while( interval ) {
    const unsigned char *lookat = lower + member_size * ( interval / 2 );
    int cmp = memcmp( lookat, key, compare_size );

    if( cmp == 0 ) {
      lower = lookat;
      found = 1;
      break;
    }
    if( cmp < 0 ) {
      /* Probe sorts before the key: continue right of it. The probe itself
         leaves the range, hence the extra decrement before halving. */
      lower = lookat + member_size;
      interval--;
    }
    interval /= 2;
  }

  /* Report an explicit 0/1 rather than the leftover interval, which is a
     size_t and could be truncated when narrowed to int. */
  *exactmatch = found;
  return (void*)lower;
}
64 | |||
65 | FILE * get_file_for_postleitzahl(char *plz) { | ||
66 | int exactmatch = 0; | ||
67 | outhandle * oh = (outhandle *)binary_search(plz, g_outhandles, g_outhandle_count, sizeof(outhandle), 5, &exactmatch); | ||
68 | if (!exactmatch) { | ||
69 | size_t s = (g_outhandles + g_outhandle_count) - oh; | ||
70 | memmove(oh + 1, oh, s * sizeof(outhandle)); | ||
71 | oh->file = fopen(plz, "a"); | ||
72 | if (!oh->file) errx( 1, "Couldn't open file %s for writing\n", plz); | ||
73 | memcpy(oh->plz, plz, 5); | ||
74 | g_outhandle_count++; | ||
75 | } | ||
76 | return oh->file; | ||
77 | } | ||
78 | |||
30 | int main(int argc, char **args) { | 79 | int main(int argc, char **args) { |
31 | FILE * in_handles[F_COUNT] = { NULL }; | 80 | FILE * in_handles[F_COUNT] = { NULL }; |
32 | FILE * out_handles[F_COUNT] = { NULL }; | 81 | FILE * out_handle = NULL; |
33 | char flags[4]; | 82 | char flags[4], outfile[6]; |
34 | int i, in_multi = 0; | 83 | int i, in_multi = 0; |
35 | char *input = malloc(1024); | 84 | char *input = malloc(1024); |
36 | size_t input_size = 1024; | 85 | size_t input_size = 1024; |
@@ -42,33 +91,26 @@ int main(int argc, char **args) { | |||
42 | errx( 1, "Couldn't open file %s\n", g_filenames[i]); | 91 | errx( 1, "Couldn't open file %s\n", g_filenames[i]); |
43 | } | 92 | } |
44 | 93 | ||
45 | mkdir( "multi", 0755); | 94 | mkdir("output", 0755); |
46 | mkdir( "single", 0755); | 95 | chdir("output"); |
47 | 96 | ||
48 | /* Get Flags to check if we're processing a continuation */ | 97 | /* Get Flags to check if we're processing a continuation */ |
49 | while (fgets(flags, 4, in_handles[F_01])) { | 98 | while (fgets(flags, 4, in_handles[F_01])) { |
50 | char out_dir[32]; | ||
51 | ssize_t linelen; | 99 | ssize_t linelen; |
52 | char flag = strtoul(flags, 0, 16); | 100 | char flag = strtoul(flags, 0, 16); |
53 | char *type = flag & 1 ? "multi/" : "single/"; | ||
54 | 101 | ||
55 | /* If we're in multiline mode, we just copy lines as long as we see continuations */ | 102 | /* If we're in multiline mode, we just copy lines as long as we see continuations */ |
56 | if (in_multi) { | 103 | if (in_multi) { |
57 | if (flag & 0x2) { | 104 | if (flag & 0x2) { |
58 | fputs(args[1], out_handles[F_00]); // write Jahr | 105 | fputs(args[1], out_handle); // write Jahr |
59 | fputc(10, out_handles[F_00]); | 106 | fputc(10, out_handle); |
60 | fwrite(flags, 3, 1, out_handles[F_01]); // copy Flags verbatim | 107 | fwrite(flags, 3, 1, out_handle); // copy Flags verbatim |
61 | for (i=F_10; i<F_COUNT; ++i) { // process the rest of entries | 108 | for (i=F_10; i<F_COUNT; ++i) { // process the rest of entries |
62 | ssize_t linelen = getline(&input, &input_size, in_handles[i]); | 109 | ssize_t linelen = getline(&input, &input_size, in_handles[i]); |
63 | fwrite(input, linelen, 1, out_handles[i]); | 110 | fwrite(input, linelen, 1, out_handle); |
64 | } | 111 | } |
65 | continue; | 112 | continue; |
66 | } | 113 | } |
67 | /* If the entry is not a continuation, close all output files and switch off multi mode */ | ||
68 | for (i=0; i<F_COUNT; ++i) { | ||
69 | fclose(out_handles[i]); | ||
70 | out_handles[i] = NULL; | ||
71 | } | ||
72 | in_multi = 0; | 114 | in_multi = 0; |
73 | } | 115 | } |
74 | 116 | ||
@@ -83,50 +125,39 @@ int main(int argc, char **args) { | |||
83 | } | 125 | } |
84 | 126 | ||
85 | if (linelen == 0) // empty PLZ | 127 | if (linelen == 0) // empty PLZ |
86 | strcpy(out_dir, in_multi ? "multi/_____" : "single/_____"); | 128 | strcpy(outfile, "_____"); |
87 | else if (linelen == 5) { // potentially normal | 129 | else if (linelen == 5) { // potentially normal |
88 | int broken = 0; | 130 | int broken = 0; |
89 | char * dest = out_dir + sprintf(out_dir, in_multi ? "multi/" : "single/"); | ||
90 | for (i=0; i<5; ++i) { | 131 | for (i=0; i<5; ++i) { |
91 | if ( (input[i] < '0' || input[i] > '9') && input[i] != '.') { | 132 | if ( (input[i] < '0' || input[i] > '9') && input[i] != '.') { |
92 | broken = 1; | 133 | broken = 1; |
93 | break; | 134 | break; |
94 | } | 135 | } |
95 | dest[i] = input[i]; | 136 | outfile[i] = input[i]; |
96 | if (dest[i] == '.') dest[i] = '_'; | 137 | if (outfile[i] == '.') outfile[i] = '_'; |
97 | } | 138 | } |
98 | dest[5] = 0; | 139 | outfile[5] = 0; |
99 | if (broken) | 140 | if (broken) |
100 | strcpy(out_dir, in_multi ? "multi/broken" : "single/broken"); | 141 | strcpy(outfile, "brken"); |
101 | } else | 142 | } else |
102 | strcpy(out_dir, in_multi ? "multi/broken" : "single/broken"); | 143 | strcpy(outfile, "brken"); |
103 | 144 | ||
104 | if (mkdir(out_dir, 0755) == -1 && errno != EEXIST) | 145 | out_handle = get_file_for_postleitzahl(outfile); |
105 | errx( 1, "Couldn't create directory %s %d\n", out_dir, errno); | ||
106 | 146 | ||
107 | for (i=F_00; i<F_COUNT; ++i) { | 147 | fputs(args[1], out_handle); // write Jahr |
108 | out_handles[i] = fopen_prefix(out_dir, i, 0); | 148 | fputc(10, out_handle); |
109 | if (!out_handles[i]) | 149 | fwrite(flags, 3, 1, out_handle); // copy Flags verbatim |
110 | errx( 1, "Couldn't open file %s\n", g_filenames[i]); | 150 | fputs(input, out_handle); // copy Postleitzahl verbatim |
111 | } | 151 | fputc(10, out_handle); |
112 | |||
113 | fputs(args[1], out_handles[F_00]); // write Jahr | ||
114 | fputc(10, out_handles[F_00]); | ||
115 | fwrite(flags, 3, 1, out_handles[F_01]); // copy Flags verbatim | ||
116 | fputs(input, out_handles[F_10]); // copy Postleitzahl verbatim | ||
117 | fputc(10, out_handles[F_10]); | ||
118 | 152 | ||
119 | for (i=F_02; i<F_COUNT; ++i) { // process the rest of entries | 153 | for (i=F_02; i<F_COUNT; ++i) { // process the rest of entries |
120 | ssize_t linelen = getline(&input, &input_size, in_handles[i]); | 154 | ssize_t linelen = getline(&input, &input_size, in_handles[i]); |
121 | fwrite(input, linelen, 1, out_handles[i]); | 155 | fwrite(input, linelen, 1, out_handle); |
122 | } | 156 | } |
123 | |||
124 | if (!in_multi) | ||
125 | for (i=0; i<F_COUNT; ++i) { | ||
126 | fclose(out_handles[i]); | ||
127 | out_handles[i] = NULL; | ||
128 | } | ||
129 | } | 157 | } |
130 | 158 | ||
159 | for (i=0; i<g_outhandle_count; ++i) | ||
160 | fclose(g_outhandles[i].file); | ||
161 | |||
131 | return 0; | 162 | return 0; |
132 | } | 163 | } |