/* ============================================================================ Name : DeadBrain1_ext.c Author : Dave Shadoff Version : Copyright : (C) 2018 Dave Shadoff Description : Extractor for text of Dead of the Brain 1 (and 2) ============================================================================ */ #include #include #include #include #include #define DEAD1 // extract for dead of the brain 1 //#define DEAD2 // extract for dead of the brain 2 // modify these two as needed: // #define UPDATE 0 // 0 = test pass (do not update database); 1 = update database #define START_STRING 1 // in case we only run partial script extracts // Dead of the Brain 1 (and 2) - extract program re-implementation // int compare_ignoring_CR(char * str1, char * str2); int Unicode_to_SJIS(int uni); int SJIS_to_Unicode(int sjis); int SJIS_to_UTF8(int sjis, unsigned char *buf); void read_Unicode_map(void); int cvt_to_tokens(unsigned char *out, unsigned char *in, int len); int cvt_to_unicode(unsigned char *out, unsigned char *in, int len); int SJIS_to_Unicode_array[65536]; int Unicode_to_SJIS_array[65536]; int SJIS_map_read_flag = 0; unsigned char Unicode_buf[5]; char * currPath; #define PATH_SEPARATOR "/" char * map_fname = "unicode.txt"; #ifdef DEAD1 char * db_fname = "DotB1_text.db"; #endif #ifdef DEAD2 char * db_fname = "DotB2_text.db"; #endif char * iso_name = "/mnt/hgfs/Documents/media/games/Dead of the Brain I & II (J)/Dead of the Brain I & II (J)-02.iso"; typedef struct token { unsigned char byte_val; int len; char * tokenname; } token; #define ENDMSG_0 0 #define ENDMSG_1 1 #define WAITKEY 2 // 2-byte #define CR 3 #define CODE04 4 // 2-byte #define TEXTSPEED 5 // 2-byte #define CODE06 6 // 2-byte #define PRINTFLAGS 7 // 2-byte #define CLEAR 8 #define CODE09 9 // 2-byte #define TOPLEFT 10 #define WAIT 11 // 2-byte #define FASTTEXTKEY 12 // 2-byte #define NUM_TOKENS 13 token token_list[NUM_TOKENS] = { {0, 1, "" }, {1, 1, "" }, {2, 2, "\n" }, {4, 2, "" }, {9, 2, "\n" }, {11, 2, " 0x9FFF) || (currptr < 0x4000)) break; // update non-abbreviated list of pointers: pointer_diskaddr[num_of_pointers] = pointer_pointer; pointer_deref[num_of_pointers] = currptr; num_of_pointers++; if (currptr < minptr) minptr = currptr; if (currptr > maxptr) maxptr = currptr; // printf ("Found pointer %x\n", currptr); // if this is the first pointer, it is unique by definition; otherwise search for duplication if (numUniqPtrs == 0) { ptrList[numUniqPtrs] = currptr; numUniqPtrs++; } else { found = 0; for (k = 0; k < numUniqPtrs; k++) { if (ptrList[k] == currptr) { found = 1; } } if (found == 0) { ptrList[numUniqPtrs] = currptr; numUniqPtrs++; } } ptrindex++; } printf ("unique pointer list size = %d\n", numUniqPtrs); // Now, we have a list of unique pointers; we should sort it now // sort the list (not-very-smart sort): for (j= 0; j < numUniqPtrs; j++) { for (k = 0; k < (numUniqPtrs - 1); k++) { if (ptrList[(k+1)] < ptrList[k]) { tmpPtr = ptrList[k]; ptrList[k] = ptrList[(k+1)]; ptrList[(k+1)] = tmpPtr; } } } // for ( j = 0; j < numUniqPtrs; j++) { printf ("string # %d: file %d, %d (%4.4x)\n", (j+1), block[block_num][1], ptrList[j], ptrList[j]); } // ptrindex = 0; stringnum = START_STRING; currloc = ptrList[0]; lastloc = ptrList[numUniqPtrs-1]; while (currloc <= lastloc) { srcpos = currloc - offsetPtr; // currloc is resident location; srcpos is offset from block start in_ptr_list = 0; for (ptr_iter=0; ptr_iter < numUniqPtrs; ptr_iter++) { if (currloc == ptrList[ptr_iter]) { in_ptr_list = 1; printf("Found currloc %4.4x at pointer #%d\n", currloc, ptr_iter+1); break; } } if (in_ptr_list == 0) { printf("Currloc %4.4x not found in pointer list\n", currloc); fprintf(fout_err, "Currloc %4.4x not found in pointer list\n", currloc); } if (memcmp(&ffstring[0], &track_data[block_start + srcpos], 16) == 0) { // then this is a blank space - either pointers point to nowhere, // or to be used a s a buffer for dynamically forming a text message (not likely) memcpy(&raw_string[0], &ffstring[0], 16); memset(&tokenized_string[0], '\0', 9999); strcpy(tokenized_string, "[FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF]"); tokenized_strlen = strlen(tokenized_string); memset(&unicode_string[0], '\0', 9999); strcpy(unicode_string, "[FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF][FF]"); unicode_strlen = strlen(unicode_string); tmpcount = 16; } else { // normal string - traverse until end is found, making sure to skip over // tokens' second-byte values (which could otherwise appear as message_end tokens) // tmpcount = 0; while (srcpos + tmpcount <= 65536) { tmpbyte = track_data[block_start + srcpos + tmpcount]; // if this is a 2-byte token, then don't mistake the second byte as a token itself (i.e. don't end prematurely) // if ( (tmpbyte < NUM_TOKENS) && token_list[tmpbyte].len > 1) { tmpcount += (token_list[tmpbyte].len - 1); } else if ((tmpbyte == ENDMSG_0) || (tmpbyte == ENDMSG_1)) { break; } tmpcount++; } tmpcount++; // grab the string (starting at srcpos, for a length of tmpcount) memcpy(&raw_string[0], &track_data[block_start + srcpos], tmpcount); memset(&tokenized_string[0], '\0', 9999); tokenized_strlen = cvt_to_tokens(&tokenized_string[0], &raw_string[0], tmpcount); memset(&unicode_string[0], '\0', 9999); unicode_strlen = cvt_to_unicode(&unicode_string[0], &tokenized_string[0], tokenized_strlen); } // verify whether string has a pointer pointing to it // verify whether pointer points into the middle of it printf("----- ----- -----\n"); printf("stringnum = %5.5d, location = %4.4x, orig len = %4.4x, end_location = %4.4x; pointer #%d\n\n", stringnum, currloc, tmpcount, (currloc+tmpcount-1), (in_ptr_list ? (ptr_iter+1) : 0)); fprintf(fout_sjis, "----- ----- -----\n"); fprintf(fout_sjis, "stringnum = %5.5d, location = %4.4x, orig len = %4.4x, end_location = %4.4x, len (including token expansion) = %4.4x, pointed to by pointer #%d\n\n", stringnum, currloc, tmpcount, (currloc+tmpcount-1), tokenized_strlen, (in_ptr_list ? (ptr_iter+1) : 0)); fprintf(fout_utf, "----- ----- -----\n"); fprintf(fout_utf, "stringnum = %5.5d, location = %4.4x, orig len = %4.4x, end_location = %4.4x, len (including token and UTF expansion) = %4.4x, pointed to by pointer #%d\n\n", stringnum, currloc, tmpcount, (currloc+tmpcount-1), unicode_strlen, (in_ptr_list ? (ptr_iter+1) : 0)); // fetch database entry from table tblTransData and: // - verify existence // - verify length // - verify UTF string char *sql = "SELECT NumBytes, Jp_Text_Dump FROM tblTransData WHERE FileId = ? AND Address = ?"; rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL); if (rc == SQLITE_OK) { sqlite3_bind_int(stmt, 1, block[block_num][1]); sqlite3_bind_int(stmt, 2, currloc); } else { printf("Failed to execute statement: %s\n", sqlite3_errmsg(db)); exit(1); } result_count = 0; while (sqlite3_step(stmt) != SQLITE_DONE) { result_count++; if (result_count > 1) { printf("result # %d ", result_count); } msglen_fromdb = sqlite3_column_int(stmt, 0); strcpy(utfextract_fromdb, sqlite3_column_text(stmt, 1)); printf("extract length = %d, database says: %d\n", tmpcount, msglen_fromdb); fprintf(fout_err, "file = %d, address = %d, extract length = %d, database says: %d\n", block[block_num][1], currloc, tmpcount, msglen_fromdb); if (compare_ignoring_CR(unicode_string, utfextract_fromdb) == 1) { fprintf(fout_err, "Strings don't match\n"); fprintf(fout_err, "Extracted Unicode_string:\n%s\n\nString from Database:\n%s\n\n", unicode_string, utfextract_fromdb); in_ptr_list = 2; } else { fprintf(fout_err, "strings match\n"); } if (result_count > 1) { exit(1); } } if (result_count == 0) { fprintf(fout_err, "file = %d, address = %d, string didn't have a database match\n", block[block_num][1], currloc); printf("file = %d, address = %d, string didn't have a database match\n", block[block_num][1], currloc); insert_0_update_1 = 0; // exit(1); } else { insert_0_update_1 = 1; } sqlite3_finalize(stmt); // // if (update == 1) { sprintf(diskaddr_hex, "0x%7.7X", (block_start + srcpos)); sprintf(addr_hex, "0x%4.4X", currloc); sprintf(numbyte_hex, "0x%4.4X", tmpcount); sprintf(endaddr_hex, "0x%4.4X", (currloc+tmpcount-1)); if (in_ptr_list == 1) { // found,, so it's a normal message not_message = 0; } else if (in_ptr_list == 2) { // string doesn't match database, so special case not_message = 2; } else { not_message = 1; // strings match, but no pointers point at it } if (insert_0_update_1 == 0) { char *insert_sql = "INSERT INTO tblTransData " "(Stringnum, FileId, Address, NumBytes, DiskAddr_Hex, Address_Hex, Numbytes_Hex, End_Addr_Hex, Orig_Data, Not_Message, Use_Orig_Data, " " Jp_Text_Dump, Modified_Date, Final_Coded_Uptodate, Comment, Translation_Complete, Insertion_Complete, Validated) " "VALUES " "( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0, " " ?, datetime('now'), 0, 'Inserted by second extract', 0, 0, 0);"; rc = sqlite3_prepare_v2(db, insert_sql, -1, &stmt, NULL); if (rc == SQLITE_OK) { sqlite3_bind_int(stmt, 1, stringnum); sqlite3_bind_int(stmt, 2, block[block_num][1]); sqlite3_bind_int(stmt, 3, currloc); sqlite3_bind_int(stmt, 4, tmpcount); sqlite3_bind_text(stmt, 5, diskaddr_hex, -1, NULL); sqlite3_bind_text(stmt, 6, addr_hex, -1, NULL); sqlite3_bind_text(stmt, 7, numbyte_hex, -1, NULL); sqlite3_bind_text(stmt, 8, endaddr_hex, -1, NULL); sqlite3_bind_blob(stmt, 9, raw_string, tmpcount, NULL); sqlite3_bind_int(stmt, 10, not_message); sqlite3_bind_text(stmt, 11, unicode_string, -1, NULL); } else { printf("Failed to execute statement: %s\n", sqlite3_errmsg(db)); exit(1); } } else // else it's an update { char *update_sql = "UPDATE tblTransData " "SET Stringnum = ?, DiskAddr_Hex = ?, Address_Hex = ?, NumBytes_Hex = ?, End_Addr_Hex = ?, Orig_Data = ?, Not_Message = ? " "WHERE FileId = ? AND Address = ?"; rc = sqlite3_prepare_v2(db, update_sql, -1, &stmt, NULL); if (rc == SQLITE_OK) { sqlite3_bind_int(stmt, 1, stringnum); sqlite3_bind_text(stmt, 2, diskaddr_hex, -1, NULL); sqlite3_bind_text(stmt, 3, addr_hex, -1, NULL); sqlite3_bind_text(stmt, 4, numbyte_hex, -1, NULL); sqlite3_bind_text(stmt, 5, endaddr_hex, -1, NULL); sqlite3_bind_blob(stmt, 6, raw_string, tmpcount, NULL); sqlite3_bind_int(stmt, 7, not_message); sqlite3_bind_int(stmt, 8, block[block_num][1]); sqlite3_bind_int(stmt, 9, currloc); } else { printf("Failed to execute statement: %s\n", sqlite3_errmsg(db)); exit(1); } } result_code = sqlite3_step(stmt); if (result_code != SQLITE_DONE) { printf("Error #%d: %s\n", result_code, sqlite3_errmsg(db)); sqlite3_close(db); exit(1); } sqlite3_finalize(stmt); } // // End of block controlled by 'update' flag // simple hex dump pointer_pointer = block_start + srcpos; for (i = 0; i < ((tmpcount/16) + 1); i++) { // printf ("%8.8x: ", pointer_pointer + (i*16)); for (k = 0; k < 16; k++) { lsbyte = raw_string[((i*16) + k)]; // printf ("%2.2x ", lsbyte); if (((i*16) + k + 2) > tmpcount) { break; } } // printf ("\n"); if (((i*16) + 16) > tmpcount) { break; } } // printf("Tokenized:\n%s\n", tokenized_string); printf("Unicode:\n%s\n", unicode_string); // printf("Unicode_strlen = %d\n", unicode_strlen); fprintf(fout_sjis, "%s\n", tokenized_string); fprintf(fout_utf, "%s\n", unicode_string); // NEXT: // check whether the string is referenced by pointer // check whether a pointer points to middle of string currloc = currloc + tmpcount; stringnum++; } // // Now, upsert into the tblPointers table // if (update == 1) { for (i = 0; i < num_of_pointers; i++) { char *upsert_sql = "INSERT INTO tblPointers(DiskAddr_Hex, FileId, Ptr_Deref, Ptr_Deref_Hex) " "VALUES (?, ?, ?, ?) " "ON CONFLICT (DiskAddr_Hex) " "DO UPDATE SET DiskAddr_Hex=excluded.DiskAddr_Hex, FileId=excluded.FileId, Ptr_Deref=excluded.Ptr_Deref, Ptr_Deref_Hex=excluded.Ptr_Deref_Hex;"; rc = sqlite3_prepare_v2(db, upsert_sql, -1, &stmt, NULL); sprintf(diskaddr_hex, "0x%7.7X", pointer_diskaddr[i]); sprintf(addr_hex, "0x%4.4X", pointer_deref[i]); if (in_ptr_list == 1) not_message = 0; else not_message = 1; if (rc == SQLITE_OK) { sqlite3_bind_text(stmt, 1, diskaddr_hex, -1, NULL); sqlite3_bind_int(stmt, 2, block[block_num][1]); sqlite3_bind_int(stmt, 3, pointer_deref[i]); sqlite3_bind_text(stmt, 4, addr_hex, -1, NULL); } else { printf("Failed to execute statement: %s\n", sqlite3_errmsg(db)); exit(1); } result_code = sqlite3_step(stmt); if (result_code != SQLITE_DONE) { printf("Error #%d: %s\n", result_code, sqlite3_errmsg(db)); sqlite3_close(db); exit(1); } sqlite3_finalize(stmt); } } fclose(fout_sjis); fclose(fout_utf); printf("Done block %d\n", block[block_num][1]); } // Next block (iterator 'block_num') fclose(fin); sqlite3_close(db); return EXIT_SUCCESS; } int compare_ignoring_CR(char * str1, char * str2) { int pos1, pos2; int equal = 0; pos1 = 0; pos2 = 0; while (1) { if ((str1[pos1] == 0x00) && (str2[pos2] == 0x00)) { break; } if ((str1[pos1] == 0x0a) || (str1[pos1] == 0x0d)) { pos1++; continue; } if ((str2[pos2] == 0x0a) || (str2[pos2] == 0x0d)) { pos2++; continue; } if (str1[pos1] == str2[pos2]) { pos1++; pos2++; continue; } equal = 1; break; } if (equal ==1) printf("Failed comparison at str1_pos = %d (%c), str2_pos = %d (%c)\n", pos1, str1[pos1], pos2, str2[pos2]); return(equal); } int cvt_to_tokens(unsigned char *out, unsigned char *in, int len) { int in_offset = 0; int out_offset = 0; unsigned char tmp_byte, tmp_byte1; while (in_offset < len) { tmp_byte = *(in+in_offset); tmp_byte1 = *(in+in_offset+1); // tokens are the first few possible ASCII characters and contiguous // if found, print name // if (tmp_byte < NUM_TOKENS) { memcpy((out+out_offset), token_list[tmp_byte].tokenname, strlen(token_list[tmp_byte].tokenname)); out_offset += strlen(token_list[tmp_byte].tokenname); // if it's a 2-byte token, then it needs to have the second byte in the token (and closing brace) if (token_list[tmp_byte].len == 2) { sprintf((char *)(out + out_offset), "%2.2X>", tmp_byte1); out_offset += 3; } in_offset += token_list[tmp_byte].len; } else { // else, it's just a normal character (or partial charater for 2-byte codes) // *(out+out_offset) = *(in + in_offset); out_offset++; in_offset++; } } *(out+out_offset) = '\0'; return out_offset; } // Note: we will use UTF-8 as 'Unicode' here int cvt_to_unicode(unsigned char *out, unsigned char *in, int len) { int in_offset = 0; int out_offset = 0; unsigned char tmp_byte, tmp_byte1; int tmp_int, utf_len; int i; while (in_offset < len) { tmp_byte = *(in+in_offset); tmp_byte1 = *(in+in_offset+1); // SJIS is 2-byte from 0x81 to 0x9F, and 0xE0 onward. // 1-byte up to 0x80, and from 0xA0 to 0xDF if (((tmp_byte > 0x80) && (tmp_byte < 0xA0)) || (tmp_byte > 0xDF)) { tmp_int = (int)(tmp_byte * 256) + tmp_byte1; utf_len = SJIS_to_UTF8(tmp_int, Unicode_buf); for (i = 0; i < utf_len; i++) { *(out+out_offset) = *(Unicode_buf+i); out_offset++; } in_offset +=2; } else // 1-byte { *(out+out_offset) = *(in + in_offset); out_offset++; in_offset++; } } *(out+out_offset) = '\0'; return(out_offset); } int SJIS_to_UTF8(int sjis, unsigned char *buf) { // instead of returning a 2-byte character, this can return up to three bytes; length is the return value int unicode_val; int len; unicode_val = SJIS_to_Unicode(sjis); if (unicode_val == 0) { printf("no unicode translation for %4.4x\n", sjis); } if (unicode_val < 0x80) { *(buf) = (unsigned char)unicode_val; len = 1; } else if (unicode_val < 0x800) { *(buf) = 0xC0 | (unsigned char)(unicode_val >> 6); *(buf+1) = 0x80 | (unsigned char)(unicode_val & 0x3f); len = 2; } else { *(buf) = 0xE0 | (unsigned char)(unicode_val >> 12); *(buf+1) = 0x80 | (unsigned char)((unicode_val & 0xfff) >> 6); *(buf+2) = 0x80 | (unsigned char)(unicode_val & 0x3f); len = 3; } return(len); } int SJIS_to_Unicode(int sjis) { if (SJIS_map_read_flag == 0) read_Unicode_map(); return(SJIS_to_Unicode_array[sjis]); } int Unicode_to_SJIS(int uni) { if (SJIS_map_read_flag == 0) read_Unicode_map(); return(Unicode_to_SJIS_array[uni]); } void read_Unicode_map(void) { FILE *uni_file; int SJIS_index, Unicode_index; int temp; int c; int line; char filename_unimap[1000]; /* build the filename for the Unicode map file */ filename_unimap[0] = '\0'; strcat(filename_unimap, currPath); strcat(filename_unimap, PATH_SEPARATOR); strcat(filename_unimap, map_fname); uni_file = fopen(filename_unimap, "rb"); if (uni_file == NULL) { printf("could not open unicode mapping file '%s'\n", filename_unimap); exit(1); } line = 1; c = fgetc(uni_file); while (!feof(uni_file)) { // get a line while (((c == 0x0d) || (c == 0x0a) || (c == 0x20)) && (!feof(uni_file))) { // leading spaces if (c == 0x0a) { line++; } c = fgetc(uni_file); } SJIS_index = 0; Unicode_index = 0; if (c == '0') { // starts out with 0x c = fgetc(uni_file); if ((c == 'x') && (!feof(uni_file))) { c = fgetc(uni_file); while ((c != 0x20) && (c != 0x09)) { temp = 0; if ((c >= '0') && (c <= '9')) { temp = (c - '0'); } else if ((c >= 'A') && (c <= 'F')) { temp = (c - 'A') + 10; } else if ((c >= 'a') && (c <= 'f')) { temp = (c - 'a') + 10; } else printf("Parsing error, line #%d -> SJIS index character = %c, code = %d\n", line, c, c); SJIS_index = (SJIS_index * 16) + temp; c = fgetc(uni_file); } } } while (((c == 0x20) || (c == 0x09)) && (!feof(uni_file))) { // intermediate spaces c = fgetc(uni_file); } if ((c == 'U') && (!feof(uni_file))) { // starts out with U+ c = fgetc(uni_file); if (c == '+') { c = fgetc(uni_file); while ((c != 0x20) && (c != 0x09) && (c != '#') && (c != 0x0d) && (c != 0x0a) && (!feof(uni_file))) { temp = 0; if ((c >= '0') && (c <= '9')) { temp = (c - '0'); } else if ((c >= 'A') && (c <= 'F')) { temp = (c - 'A') + 10; } else if ((c >= 'a') && (c <= 'f')) { temp = (c - 'a') + 10; } else printf("Parsing error, line #%d -> Unicode index character = %c, code = %d\n", line, c, c); Unicode_index = (Unicode_index * 16) + temp; c = fgetc(uni_file); } } } SJIS_to_Unicode_array[SJIS_index] = Unicode_index; Unicode_to_SJIS_array[Unicode_index] = SJIS_index; while (((c != 0x0d) && (c != 0x0a)) && (!feof(uni_file))) { // trailing part of line if (c == 0x0a) { line++; } c = fgetc(uni_file); } } SJIS_map_read_flag = 1; return; }