/* Copyright (C) 1997, 2000 artofcode LLC. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA, 02111-1307. */ /*$Id: gsfcmap.c,v 1.14.2.2.2.1 2003/01/17 00:49:02 giles Exp $ */ /* CMap character decoding */ #include "memory_.h" #include "errors.h" #include "gx.h" #include "gserrors.h" #include "gsstruct.h" #include "gsutil.h" /* for gs_next_ids */ #include "gxfcmap.h" /* GC descriptors */ public_st_cmap(); /* Because lookup ranges can be elements of arrays, */ /* their enum_ptrs procedure must never return 0 prematurely. */ private ENUM_PTRS_WITH(code_lookup_range_enum_ptrs, gx_code_lookup_range_t *pclr) return 0; case 0: if (pclr->value_type == CODE_VALUE_GLYPH) { const byte *pv = pclr->values.data; int k; for (k = 0; k < pclr->num_keys; ++k) { gs_glyph glyph = 0; int i; for (i = 0; i < pclr->value_size; ++i) glyph = (glyph << 8) + *pv++; pclr->cmap->mark_glyph(glyph, pclr->cmap->mark_glyph_data); } } return ENUM_OBJ(pclr->cmap); case 1: return ENUM_STRING(&pclr->keys); case 2: return ENUM_STRING(&pclr->values); ENUM_PTRS_END private RELOC_PTRS_WITH(code_lookup_range_reloc_ptrs, gx_code_lookup_range_t *pclr) RELOC_VAR(pclr->cmap); RELOC_STRING_VAR(pclr->keys); RELOC_STRING_VAR(pclr->values); RELOC_PTRS_END public_st_code_lookup_range(); public_st_code_lookup_range_element(); /* ---------------- Procedures ---------------- */ /* * Initialize a just-allocated CMap, to ensure that all pointers are clean * for the GC. */ void gs_cmap_init(gs_cmap_t *pcmap) { memset(pcmap, 0, sizeof(*pcmap)); pcmap->id = gs_next_ids(1); uid_set_invalid(&pcmap->uid); } /* * Create an Identity CMap. */ int gs_cmap_create_identity(gs_cmap_t **ppcmap, int num_bytes, int wmode, gs_memory_t *mem) { gs_cmap_t *pcmap = gs_alloc_struct(mem, gs_cmap_t, &st_cmap, "gs_cmap_create_identity(CMap)"); gx_code_space_range_t *range = (gx_code_space_range_t *) gs_alloc_bytes(mem, sizeof(gx_code_space_range_t), "gs_cmap_create_identity(code space range)"); gx_code_lookup_range_t *lookup = gs_alloc_struct_array(mem, 1, gx_code_lookup_range_t, &st_code_lookup_range, "gs_cmap_create_identity(lookup range)"); /* We allocate CIDSystemInfo dynamically only for the sake of the GC. */ gs_cid_system_info_t *pcidsi = gs_alloc_struct(mem, gs_cid_system_info_t, &st_cid_system_info, "gs_cmap_create_identity(CIDSystemInfo)"); static const byte key_data[8] = { 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff }; static const gs_cid_system_info_t identity_cidsi = { { (const byte *)"Adobe", 5 }, { (const byte *)"Identity", 8 }, 0 }; if (pcmap == 0 || range == 0 || lookup == 0 || pcidsi == 0) return_error(gs_error_VMerror); if (num_bytes != 2) /* for now */ return_error(gs_error_rangecheck); gs_cmap_init(pcmap); pcmap->CMapType = 1; pcmap->CMapName.data = (const byte *) (wmode ? "Identity-V" : "Identity-H"); pcmap->CMapName.size = 10; *pcidsi = identity_cidsi; pcmap->CIDSystemInfo = pcidsi; pcmap->num_fonts = 1; pcmap->CMapVersion = 1.0; /* no uid, UIDOffset */ pcmap->WMode = wmode; memset(range->first, 0, num_bytes); memset(range->last, 0xff, num_bytes); range->size = num_bytes; pcmap->code_space.ranges = range; pcmap->code_space.num_ranges = 1; memset(lookup, 0, sizeof(*lookup)); lookup->cmap = pcmap; lookup->key_size = num_bytes; lookup->num_keys = 1; lookup->key_is_range = true; /* * It's OK to break const here, because the strings are never * freed, and the GC can handle strings outside the heap. */ lookup->keys.data = (byte*) (key_data + 4 - num_bytes); lookup->keys.size = num_bytes * 2; lookup->value_type = CODE_VALUE_CID; lookup->value_size = num_bytes; /* ditto */ lookup->values.data = (byte*) key_data; lookup->values.size = num_bytes; pcmap->def.lookup = lookup; pcmap->def.num_lookup = 1; /* no notdef */ /* no mark_glyph, mark_glyph_data, glyph_name, glyph_name_data */ *ppcmap = pcmap; return 0; } /* * multi-dimensional range comparator */ private void print_msg_str_in_range(const byte *str, const byte *key_lo, const byte *key_hi, int key_size) { debug_print_string_hex(str, key_size); dlprintf(" in "); debug_print_string_hex(key_lo, key_size); dlprintf(" - "); debug_print_string_hex(key_hi, key_size); dlprintf("\n"); } private int gs_cmap_get_shortest_chr(const gx_code_map_t * pcmap, uint *pfidx) { int i; int len_shortest = MAX_CMAP_CODE_SIZE; uint fidx_shortest = 0; /* font index for this fallback */ for (i = pcmap->num_lookup - 1; i >= 0; --i) { const gx_code_lookup_range_t *pclr = &pcmap->lookup[i]; if ((pclr->key_prefix_size + pclr->key_size) <= len_shortest) { len_shortest = (pclr->key_prefix_size + pclr->key_size); fidx_shortest = pclr->font_index; } } *pfidx = fidx_shortest; return len_shortest; } /* * multi-dimensional relative position calculator * * Returns offset of the given CID, considering CID range * as array of CIDs (the last index changes fastest). */ private int gs_multidim_CID_offset(const byte *key_str, const byte *key_lo, const byte *key_hi, int key_size) { int i; /* index for current dimension */ int CID_offset = 0; if (gs_debug_c('J')) { dlprintf("[J]gmCo() calc CID_offset for 0x"); print_msg_str_in_range(key_str, key_lo, key_hi, key_size); } for (i = 0; i < key_size; i++) CID_offset = CID_offset * (key_hi[i] - key_lo[i] + 1) + key_str[i] - key_lo[i]; if_debug1('J', "[J]gmCo() CID_offset = %d\n", CID_offset); return CID_offset; } /* Get a big-endian integer. */ private uint bytes2int(const byte *p, int n) { uint v = 0; int i; for (i = 0; i < n; ++i) v = (v << 8) + p[i]; return v; } /* * Decode a character from a string using a code map, updating the index. * Return 0 for a CID or name, N > 0 for a character code where N is the * number of bytes in the code, or an error. Shift the decoded bytes into * *pchr. For undefined characters, set *pglyph = gs_no_glyph and return 0. */ private int code_map_decode_next(const gx_code_map_t * pcmap, const gs_const_string * pstr, uint * pindex, uint * pfidx, gs_char * pchr, gs_glyph * pglyph) { const byte *str = pstr->data + *pindex; uint ssize = pstr->size - *pindex; /* * The keys are not sorted due to 'usecmap'. Possible optimization : * merge and sort keys in 'zbuildcmap', then use binary search here. * This would be valuable for UniJIS-UTF8-H, which contains about 7000 * keys. */ int i; for (i = pcmap->num_lookup - 1; i >= 0; --i) { /* reverse scan order due to 'usecmap' */ const gx_code_lookup_range_t *pclr = &pcmap->lookup[i]; int pre_size = pclr->key_prefix_size, key_size = pclr->key_size, chr_size = pre_size + key_size; if (ssize < chr_size) continue; if (memcmp(str, pclr->key_prefix, pre_size)) continue; /* Search the lookup range. We could use binary search. */ { const byte *key = pclr->keys.data; int step = key_size; int k; const byte *pvalue; if (pclr->key_is_range) { step <<= 1; for (k = 0; k < pclr->num_keys; ++k, key += step) if (memcmp(str + pre_size, key, key_size) >= 0 && memcmp(str + pre_size, key + key_size, key_size) <= 0) break; } else { for (k = 0; k < pclr->num_keys; ++k, key += step) if (!memcmp(str + pre_size, key, key_size)) break; } if (k == pclr->num_keys) continue; /* We have a match. Return the result. */ *pchr = (*pchr << (chr_size * 8)) + bytes2int(str, chr_size); *pindex += chr_size; *pfidx = pclr->font_index; pvalue = pclr->values.data + k * pclr->value_size; switch (pclr->value_type) { case CODE_VALUE_CID: *pglyph = gs_min_cid_glyph + bytes2int(pvalue, pclr->value_size) + bytes2int(str + pre_size, key_size) - bytes2int(key, key_size); return 0; case CODE_VALUE_GLYPH: *pglyph = bytes2int(pvalue, pclr->value_size); return 0; case CODE_VALUE_CHARS: *pglyph = bytes2int(pvalue, pclr->value_size) + bytes2int(str + pre_size, key_size) - bytes2int(key, key_size); return pclr->value_size; default: /* shouldn't happen */ return_error(gs_error_rangecheck); } } } /* No mapping. */ *pglyph = gs_no_glyph; return 0; } private int code_map_decode_next_multidim_regime(const gx_code_map_t * pcmap, const gs_const_string * pstr, uint * pindex, uint * pfidx, gs_char * pchr, gs_glyph * pglyph) { const byte *str = pstr->data + *pindex; uint ssize = pstr->size - *pindex; /* * The keys are not sorted due to 'usecmap'. Possible optimization : * merge and sort keys in 'zbuildcmap', then use binary search here. * This would be valuable for UniJIS-UTF8-H, which contains about 7000 * keys. */ int i; /* * In the fallback of CMap decoding procedure, there is "partial matching". * For detail, refer PostScript Ref. Manual v3 at the end of Fonts chapter. */ /* "pm" stands for partial match (not pointer), temporal use. */ int pm_maxlen = 0; /* partial match: max length */ int pm_index = *pindex; /* partial match: ptr index (in str) */ uint pm_fidx = *pfidx; /* partial match: ptr font index */ gs_char pm_chr = *pchr; /* partial match: ptr character */ *pchr = '\0'; if (gs_debug_c('J')) { dlprintf("[J]CMDNmr() is called: str=("); debug_print_string_hex(str, ssize); dlprintf3(") @ 0x%lx ssize=%d, %d ranges to check\n", str, ssize, pcmap->num_lookup); } for (i = pcmap->num_lookup - 1; i >= 0; --i) { /* main loop - scan the map passed via pcmap */ /* reverse scan order due to 'usecmap' */ const gx_code_lookup_range_t *pclr = &pcmap->lookup[i]; int pre_size = pclr->key_prefix_size, key_size = pclr->key_size, chr_size = pre_size + key_size; int j = 0; /* length of the given byte stream is shorter than * chr-length of current range, no need for further check, * skip to the next range. */ if (ssize < chr_size) continue; if (0 < pre_size) { const byte * prefix = pclr->key_prefix; /* check partial match in prefix */ for (j = 0; j < pre_size; j++) if (prefix[j] != str[j]) break; if (0 == j) /* no match, skip to next i */ continue; else if (j < pre_size) { /* not exact, partial match */ if (gs_debug_c('J')) { dlprintf("[J]CMDNmr() partial match with prefix:"); print_msg_str_in_range(str, prefix, prefix, pre_size); } if (pm_maxlen < j) { pm_maxlen = chr_size; pm_chr = bytes2int(str, chr_size); pm_index = (*pindex) + chr_size; pm_fidx = pclr->font_index; } continue ; /* no need to check key, skip to next i */ } if (gs_debug_c('J')) { dlprintf("[J]CMDNmr() full match with prefix:"); print_msg_str_in_range(str, prefix, prefix, pre_size); } } /* if (0 < pre_size) */ /* full match in prefix. check key */ { const byte *key = pclr->keys.data; int step = key_size; int k, l, m; const byte *pvalue = NULL; /* when range is "range", 2 keys for lo-end and hi-end * are stacked. So twice the step. current "key" points * lo-end of current range, and the pointer for hi-end * is calculated by (key + step - key_size). */ if (pclr->key_is_range) step <<=1; /* step = step * 2; */ for (k = 0; k < pclr->num_keys; ++k, key += step) { if_debug0('j', "[j]CMDNmr() check key:"); if (gs_debug_c('j')) print_msg_str_in_range(str + pre_size, key, key + step - key_size, key_size) ; for (l = 0; l < key_size; l++) { byte c = str[l + pre_size]; if (c < key[l] || c > key[step - key_size + l]) break; } if (pm_maxlen < pre_size + l) { pm_maxlen = chr_size; pm_chr = bytes2int(str, chr_size); pm_index = (*pindex) + chr_size; pm_fidx = pclr->font_index; } if (l == key_size) break; } /* all keys are tried, but found no match. */ /* go to next prefix. */ if (k == pclr->num_keys) continue; /* We have a match. Return the result. */ *pchr = bytes2int(str, chr_size); *pindex += chr_size; *pfidx = pclr->font_index; pvalue = pclr->values.data + k * pclr->value_size; if (gs_debug_c('J')) { dlprintf("[J]CMDNmr() full matched pvalue=("); debug_print_string_hex(pvalue, pclr->value_size); dlprintf(")\n"); } switch (pclr->value_type) { case CODE_VALUE_CID: *pglyph = gs_min_cid_glyph + bytes2int(pvalue, pclr->value_size) + gs_multidim_CID_offset(str + pre_size, key, key + step - key_size, key_size); return 0; case CODE_VALUE_NOTDEF: *pglyph = gs_min_cid_glyph + bytes2int(pvalue, pclr->value_size); return 0; case CODE_VALUE_GLYPH: *pglyph = bytes2int(pvalue, pclr->value_size); return 0; case CODE_VALUE_CHARS: *pglyph = bytes2int(pvalue, pclr->value_size) + bytes2int(str + pre_size, key_size) - bytes2int(key, key_size); return pclr->value_size; default: /* shouldn't happen */ return_error(gs_error_rangecheck); } } } /* No mapping. */ *pchr = pm_chr; *pindex = pm_index; *pfidx = pm_fidx; *pglyph = gs_no_glyph; if (gs_debug_c('J')) { dlprintf("[J]CMDNmr() no full match, use partial match for ("); debug_print_string_hex(str, pm_maxlen); dlprintf(")\n"); } return 0; } /* * Decode a character from a string using a CMap. * Return like code_map_decode_next. * At present, the range specification by (begin|end)codespacerange * is not used in this function. Therefore, this function accepts * some invalid CMap which def & undef maps exceed the codespacerange. * It should be checked in this function, or some procedure in gs_cmap.ps. */ int gs_cmap_decode_next(const gs_cmap_t * pcmap, const gs_const_string * pstr, uint * pindex, uint * pfidx, gs_char * pchr, gs_glyph * pglyph) { uint save_index = *pindex; int code; uint pm_index; uint pm_fidx; gs_char pm_chr; /* For first, check defined map */ if_debug0('J', "[J]GCDN() check def CMap\n"); code = code_map_decode_next_multidim_regime(&pcmap->def, pstr, pindex, pfidx, pchr, pglyph); /* This is defined character */ if (code != 0 || *pglyph != gs_no_glyph) return code; /* In here, this is NOT defined character */ /* save partially matched results */ pm_index = *pindex; pm_fidx = *pfidx; pm_chr = *pchr; /* check notdef map. */ if_debug0('J', "[J]GCDN() check notdef CMap\n"); *pindex = save_index; code = code_map_decode_next_multidim_regime(&pcmap->notdef, pstr, pindex, pfidx, pchr, pglyph); /* This is defined "notdef" character. */ if (code != 0 || *pglyph != gs_no_glyph) return code; /* * This is undefined in def & undef maps, * use partially matched result with default notdef (CID = 0). */ if (save_index < pm_index) { /* there was some partially matched */ *pglyph = gs_min_cid_glyph; /* CID = 0 */ *pindex = pm_index; *pfidx = pm_fidx; *pchr = '\0'; return 0; /* should return some error for partial matched .notdef? */ } else { /* no match */ /* Even partial match is failed. * Getting the shortest length from defined characters, * and take the leading bytes (with same length of the shortest * defined chr) as an unidentified character: CID = 0. * Also this procedure is specified in PS Ref. Manual v3, * at the end of Fonts chapter. */ const byte *str = pstr->data + save_index; uint ssize = pstr->size - save_index; int chr_size_shortest = gs_cmap_get_shortest_chr(&pcmap->def, pfidx); if (chr_size_shortest <= ssize) { *pglyph = gs_min_cid_glyph; /* CID = 0, this is CMap fallback */ *pindex = save_index + chr_size_shortest; *pchr = '\0'; if (gs_debug_c('J')) { dlprintf1("[J]GCDN() no partial match, skip %d byte (", chr_size_shortest); debug_print_string_hex(str, chr_size_shortest); dlprintf(")\n"); } return 0; /* should return some error for fallback .notdef? */ } else { /* Undecodable string is shorter than the shortest character, * there's no way except to return error. */ *pglyph = gs_no_glyph; return -1; if (gs_debug_c('J')) { dlprintf2("[J]GCDN() left data in buffer (%d) is shorter than shortest defined character (%d)\n", ssize, chr_size_shortest); } *pglyph = gs_no_glyph; return_error(e_rangecheck); } } }