From 08a9f406a0e18e0902bdf4f50b4f5ecad9fe2588 Mon Sep 17 00:00:00 2001
From: Dirk Engling <erdgeist@erdgeist.org>
Date: Sun, 10 May 2015 21:45:16 +0200
Subject: Split entries into different column files. Done for
 non-continuation-entries

---
 src/export/split_version_2.c | 185 +++++++++++++++++++++++--------------------
 1 file changed, 99 insertions(+), 86 deletions(-)

diff --git a/src/export/split_version_2.c b/src/export/split_version_2.c
index 2b7a79f..7a6f04e 100644
--- a/src/export/split_version_2.c
+++ b/src/export/split_version_2.c
@@ -1,10 +1,20 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <ctype.h>
+#include <fcntl.h>
+#include <unistd.h>
 
 #include "mystdlib.h"
 
-static int g_first_field_length;
+enum { g_outfiles = 17 };
+
+static struct {
+  int      outfiles[g_outfiles];
+  uint8_t *outbuf  [g_outfiles];
+  size_t   outfill [g_outfiles];
+} g_state;
+
+static int g_northern_version;;
 static uint8_t cp437_to_iso8859_1_table[] = {
   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
@@ -23,14 +33,24 @@ static uint8_t cp437_to_iso8859_1_table[] = {
   0x2e, 0xdf, 0x2e, 0x2e, 0x2e, 0x2e, 0xb5, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e,
   0x2e, 0xb1, 0x2e, 0x2e, 0x2e, 0x2e, 0xf7, 0x2e, 0xb0, 0x2e, 0xb7, 0x2e, 0x2e, 0xb2, 0x2e, 0xa0
 };
-static void dump_field( uint8_t **end) {
-  uint8_t *e = *end;
-  uint8_t len = *--e;
-  *end = e - len;
-  printf( "_%02d_ ", len );
+
+static void dump_string( uint8_t *start, size_t len, int file ) {
+  uint8_t *dest = g_state.outbuf[file] + g_state.outfill[file];
+  g_state.outfill[file] += len;
   while( len-- )
-    putchar( cp437_to_iso8859_1_table[ e[-len-1] ] );
-  putchar( 9 );
+    *(dest++) = cp437_to_iso8859_1_table[ *(start++) ];
+}
+
+static void dump_field( uint8_t **end, int file ) {
+  uint8_t len = (*end)[-1];
+  *end -= len + 1;
+  dump_string( *end, len, file);
+}
+
+static void finish_record( ) {
+  int i;
+  for( i=0; i<g_outfiles; ++i )
+    *( g_state.outbuf[i] + g_state.outfill[i]++ ) = '\n';
 }
 
 static uint16_t load_word( uint8_t *table_start, uint16_t base, int offset ) {
@@ -48,79 +68,45 @@ static uint16_t load_word( uint8_t *table_start, uint16_t base, int offset ) {
 
 static void dump_primary( uint8_t *end, uint32_t flags ) {
   /* First dump type of record (should be 1) */
-  putchar( *end );
-  putchar( 9 );
+  dump_string( end, 1, 0 );
 
   /* Dump first 5 chars of zip, they're always there */
-  printf( "%c%c%c%c%c\t", end[-5], end[-4], end[-3], end[-2], end[-1] );
-  end -= 5;
-
-  if( g_first_field_length == 5 ) {
-    /* Dump first 5 chars of prefix, they're always there */
-    printf( "%c%c%c%c%c\t", end[-5], end[-4], end[-3], end[-2], end[-1] );
-    end -= 5;
-
-    /* There is another version of the zip code present, if this bit is set */
-    if( flags & 0x0080 ) {
-      printf( "%c%c%c%c%c\t", end[-5], end[-4], end[-3], end[-2], end[-1] );
-      end -= 5;
-    }
+  dump_string( end -= 5, 5, 1 );
 
-    /* There is an unclear X present, if this bit is set */
-    if( flags & 0x0040 ) {
-       printf( "%c\t", end[-1] );
-       end--;
-    }
+  /* Dump first 5 chars of prefix, they're always here in s and w */
+  if( !g_northern_version )
+    dump_string( end -= 5, 5, 2 );
 
-    /* There is another version of the prefix present, if this bit is set */
-    if( flags & 0x0020 ) {
-      printf( "%c%c%c%c%c\t", end[-5], end[-4], end[-3], end[-2], end[-1] );
-      end -= 5;
-    }
+  /* There is another version of the zip code present, if this bit is set */
+  if( flags & 0x0080 ) dump_string( end -= 5, 5, 3 );
 
-    if( flags & 0x0010 ) dump_field( &end );
-    if( flags & 0x0008 ) dump_field( &end );
-    if( flags & 0x0004 ) dump_field( &end );
-    if( flags & 0x0002 ) dump_field( &end );
-    if( flags & 0x0001 ) dump_field( &end );
-    if( flags & 0x8000 ) dump_field( &end );
-    if( flags & 0x4000 ) dump_field( &end );
-    if( flags & 0x2000 ) dump_field( &end );
-    if( flags & 0x1000 ) dump_field( &end );
-
-    dump_field( &end );
-    dump_field( &end );
-  } else {
-
-    /* There is another version of the zip code present, if this bit is set */
-    if( flags & 0x0080 ) {
-      printf( "%c%c%c%c%c\t", end[-5], end[-4], end[-3], end[-2], end[-1] );
-      end -= 5;
-    }
+  /* There is an unclear X present, if this bit is set */
+  if( flags & 0x0040 ) dump_string( end -= 1, 1, 4 );
 
-    /* There is an unclear X present, if this bit is set */
-    if( flags & 0x0040 ) {
-       printf( "%c\t", end[-1] );
-       end--;
-    }
-
-    if( flags & 0x0010 ) dump_field( &end );
-    if( flags & 0x0008 ) dump_field( &end );
-    if( flags & 0x0004 ) dump_field( &end );
-    if( flags & 0x0002 ) dump_field( &end );
-    if( flags & 0x0001 ) dump_field( &end );
-    if( flags & 0x8000 ) dump_field( &end );
-    if( flags & 0x4000 ) dump_field( &end );
-    if( flags & 0x2000 ) dump_field( &end );
-    if( flags & 0x1000 ) dump_field( &end );
-
-    /* There is another version of the prefix present, if this bit is set */
-    if( flags & 0x0020 ) dump_field( &end );
-
-    dump_field( &end );
-    dump_field( &end );
-    dump_field( &end );
+  /* There is another version of the prefix present, if this bit is set */
+  if( flags & 0x0020 ) {
+    if( !g_northern_version )
+      dump_string( end -= 5, 5, 5 );
+    else
+      dump_field( &end, 5 );
   }
+
+  if( flags & 0x0010 ) dump_field( &end, 6 );
+  if( flags & 0x0008 ) dump_field( &end, 7 );
+  if( flags & 0x0004 ) dump_field( &end, 8 );
+  if( flags & 0x0002 ) dump_field( &end, 9 );
+  if( flags & 0x0001 ) dump_field( &end, 10 );
+  if( flags & 0x8000 ) dump_field( &end, 11 );
+  if( flags & 0x4000 ) dump_field( &end, 12 );
+  if( flags & 0x2000 ) dump_field( &end, 13 );
+  if( flags & 0x1000 ) dump_field( &end, 14 );
+
+  dump_field( &end, 15 );
+
+  /* In the northern version the prefix is a length-prefixed field here */
+  if( g_northern_version )
+    dump_field( &end, 2 );
+  dump_field( &end, 16 );
 }
 
 static void act_on_record( uint8_t * end, uint16_t base ) {
@@ -129,55 +115,82 @@ static void act_on_record( uint8_t * end, uint16_t base ) {
   uint16_t num_dwords = rec[0];
   uint16_t flagbytes  = rec[1];
   uint32_t flags = 0;
-  int bold = 0, i;
+  int i;
 
   for (i=0; i<flagbytes; ++i) flags = (flags<<8) | *--end;
 
-  /*
-  putchar( 27 ) ; putchar( '[' ); putchar( '3' );
-  putchar( ( flags & test_me ) ? '1' : '2' );
-  putchar( 'm' ); */
-  (void)bold;
-
-  printf( "-------- %03d: %06X\n", num_dwords, flags );
+//  printf( "-------- %03d: %06X\n", num_dwords, flags );
   for (i = 0; i < num_dwords; i++ ) {
     uint16_t subflag = load_word( e, base, 2*i );
     uint16_t t_off   = load_word( e, base, 2*i+1 );
     uint8_t *rec_start = end - t_off;
 
-    printf( "%04x (%04x): ", subflag, t_off );
     if (i == 0)
       dump_primary( rec_start, flags );
     else {
       uint8_t * rec_end = end - ( ( i+1 == num_dwords ) ? 0 : load_word( e, base, 2*i+3 ) );
+      printf( "(%04X): ", subflag );
+//      dump_string( rec_start, rec_end, file );
       while ( rec_start < rec_end )
         putchar( cp437_to_iso8859_1_table[ *(rec_start++) ] );
+      putchar(10);
     }
-    putchar(10);
   }
+  finish_record();
 }
 
 int main( int args, char **argv ) {
+  char filename[1024];
   MAP data, index;
   uint32_t * indoff;
+  size_t limit;
+  int i;
 
   if( args != 3 ) {
     fprintf( stderr, "Syntax: %s <dumpfile> <indexfile>\n", argv[0] );
     exit(1);
   }
+
+  for( i=0; i<g_outfiles; ++i )
+  {
+    sprintf( filename, "%02d_unknown", i+1 );
+    g_state.outfiles[i] = open( filename, O_WRONLY | O_APPEND | O_CREAT, 0644 );
+    g_state.outbuf[i]   = malloc(8192*4096);
+    g_state.outfill[i]  = 0;
+  }
+
   data  = map_file( argv[1], 1 );
   index = map_file( argv[2], 1 );
 
   if( !data || !index )
     exit( 1 );
 
-  g_first_field_length = data->addr[0x21e];
+  g_northern_version = data->addr[0x21e] != 5;
+
+  /* Each entry in outfile[0] is flag + \n, i.e. 3 bytes
+     We want to flush at every percent of progress */
+  limit = 3 * ( ( index->size / 4 ) / 100 );
 
   for( indoff = (uint32_t*)(index->addr + 8);
        indoff < (uint32_t*)(index->addr + index->size) && *indoff;
        indoff++ ) {
 //        printf( "\nActing on record at off: %08X\n", *indoff );
-//        if( indoff[0] >> 14 < indoff[1] >> 14 )
         act_on_record( data->addr + *indoff, *indoff & 0x1fff );
+
+    /* Write out results */
+    if( g_state.outfill[0] > limit )
+      for( i=0; i<g_outfiles; ++i ) {
+        if( g_state.outfill[i] > 1024*1024*6 ) printf( "Large: %zd\n", g_state.outfill[i] );
+        write( g_state.outfiles[i], g_state.outbuf[i], g_state.outfill[i] );
+        g_state.outfill[i]  = 0;
+      }
   }
+
+  for( i=0; i<g_outfiles; ++i ) {
+    write( g_state.outfiles[i], g_state.outbuf[i], g_state.outfill[i] );
+    close( g_state.outfiles[i] );
+  }
+  unmap_file( &data );
+  unmap_file( &index );
+
 }
-- 
cgit v1.2.3