#ifndef lint static char *rcsid = "$Id: ucsset.c,v 1.1.1.1 2003-06-04 00:26:15 marka Exp $"; #endif /* * Copyright (c) 2001 Japan Network Information Center. All rights reserved. * * By using this file, you agree to the terms and conditions set forth bellow. * * LICENSE TERMS AND CONDITIONS * * The following License Terms and Conditions apply, unless a different * license is obtained from Japan Network Information Center ("JPNIC"), * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, * Chiyoda-ku, Tokyo 101-0047, Japan. * * 1. Use, Modification and Redistribution (including distribution of any * modified or derived work) in source and/or binary forms is permitted * under this License Terms and Conditions. * * 2. Redistribution of source code must retain the copyright notices as they * appear in each source code file, this License Terms and Conditions. * * 3. Redistribution in binary form must reproduce the Copyright Notice, * this License Terms and Conditions, in the documentation and/or other * materials provided with the distribution. For the purposes of binary * distribution the "Copyright Notice" refers to the following language: * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." * * 4. The name of JPNIC may not be used to endorse or promote products * derived from this Software without specific prior written approval of * JPNIC. * * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. */ #include #include #include #include #include #include #include #include #define UCS_MAX 0x80000000UL #define INIT_SIZE 50 /* * Code point range. * * The set of code points is represented by an array of code point ranges. * In the building phase, specified ranges by 'idn_ucsset_add' or * 'idn_ucsset_addrange' are simply appended to the array. * And 'idn_ucsset_fix' sorts the array by the code point value, and also * merges any intersecting ranges. Since the array is sorted, a binary * search can be used for looking up. */ typedef struct { unsigned long from; unsigned long to; } range_t; /* * Code point segment. * * To speed up searching further, the entire region of UCS-4 code points * (U+0000 - U+7FFFFFFF) are divided into segments. For each segment, * the first and last element of the range array corresponding to the * segment are computed by 'idn_ucsset_fix'. This narrows down the * (initial) search range. */ typedef struct { int range_start; /* index of ucsset.ranges */ int range_end; /* ditto */ } segment_t; /* * Code point to segment index conversion. * * Below is the function that maps a code point to the corresponding segment. * The mapping is non-uniform, so that BMP, the following 16 planes that * comprise Unicode code points together with BMP, and other planes * have different granularity. */ #define SEG_THLD1 0x10000 /* BMP */ #define SEG_THLD2 0x110000 /* Unicode (BMP+16planes) */ #define SEG_SFT1 10 /* BMP: 1K code points/segment */ #define SEG_SFT2 14 /* following 16 planes: 16K cp/seg */ #define SEG_SFT3 24 /* rest: 16M cp/seg */ #define SEG_OFF1 (SEG_THLD1 >> SEG_SFT1) #define SEG_OFF2 (((SEG_THLD2 - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1) #define SEG_INDEX(v) \ (((v) < SEG_THLD1) ? ((v) >> SEG_SFT1) : \ ((v) < SEG_THLD2) ? ((((v) - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1) : \ ((((v) - SEG_THLD2) >> SEG_SFT3) + SEG_OFF2)) #define SEG_LEN (SEG_INDEX(UCS_MAX - 1) + 1) /* * Representation of set of UCS code points. */ typedef struct idn_ucsset { segment_t segments[SEG_LEN]; int fixed; int size; /* allocated size of 'ranges' */ int nranges; /* num of ranges */ range_t *ranges; int refcnt; /* reference count */ } ucsset; static idn_result_t addrange(idn_ucsset_t ctx, unsigned long from, unsigned long to, char *func_name); static int comp_range(const void *v1, const void *v2); idn_result_t idn_ucsset_create(idn_ucsset_t *ctx) { idn_ucsset_t bm; assert(ctx != NULL); TRACE(("idn_ucsset_create()\n")); if ((bm = malloc(sizeof(ucsset))) == NULL) { WARNING(("idn_ucsset_create: malloc failed\n")); return idn_nomemory; } bm->size = bm->nranges = 0; bm->ranges = NULL; bm->fixed = 0; bm->refcnt = 1; *ctx = bm; return (idn_success); } void idn_ucsset_destroy(idn_ucsset_t ctx) { assert(ctx != NULL && ctx->refcnt > 0); TRACE(("idn_ucsset_destroy()\n")); if (--ctx->refcnt == 0) { if (ctx->ranges != NULL) free(ctx->ranges); free(ctx); } } void idn_ucsset_incrref(idn_ucsset_t ctx) { assert(ctx != NULL && ctx->refcnt > 0); TRACE(("idn_ucsset_incrref()\n")); ctx->refcnt++; } idn_result_t idn_ucsset_add(idn_ucsset_t ctx, unsigned long v) { assert(ctx != NULL && ctx->refcnt > 0); TRACE(("idn_ucsset_add(v=U+%lX)\n", v)); return (addrange(ctx, v, v, "idn_ucsset_add")); } idn_result_t idn_ucsset_addrange(idn_ucsset_t ctx, unsigned long from, unsigned long to) { assert(ctx != NULL && ctx->refcnt > 0); TRACE(("idn_ucsset_addrange(from=U+%lX, to=U+%lX)\n", from, to)); return (addrange(ctx, from, to, "idn_ucsset_addrange")); } void idn_ucsset_fix(idn_ucsset_t ctx) { int nranges; range_t *ranges; segment_t *segments; int i, j; assert(ctx != NULL && ctx->refcnt > 0); TRACE(("idn_ucsset_fix()\n")); nranges = ctx->nranges; ranges = ctx->ranges; segments = ctx->segments; if (ctx->fixed) return; ctx->fixed = 1; /* Initialize segment array */ for (i = 0; i < SEG_LEN; i++) { segments[i].range_start = -1; segments[i].range_end = -1; } /* If the set is empty, there's nothing to be done. */ if (nranges == 0) return; /* Sort ranges. */ qsort(ranges, nranges, sizeof(range_t), comp_range); /* Merge overlapped/continuous ranges. */ for (i = 0, j = 1; j < nranges; j++) { if (ranges[i].to + 1 >= ranges[j].from) { /* can be merged */ if (ranges[i].to < ranges[j].to) { ranges[i].to = ranges[j].to; } } else { i++; if (i < j) ranges[i] = ranges[j]; } } /* 'i' points the last range in the array. */ ctx->nranges = nranges = ++i; /* Create segment array. */ for (i = 0; i < nranges; i++) { int fidx = SEG_INDEX(ranges[i].from); int tidx = SEG_INDEX(ranges[i].to); for (j = fidx; j <= tidx; j++) { if (segments[j].range_start < 0) segments[j].range_start = i; segments[j].range_end = i; } } #if 0 /* * Does the standard guarantee realloc() always succeeds * when shrinking? */ /* Shrink malloc'ed space if possible. */ ctx->ranges = realloc(ctx->ranges, ctx->nranges * sizeof(range_t)); #endif } idn_result_t idn_ucsset_lookup(idn_ucsset_t ctx, unsigned long v, int *found) { int idx; segment_t *segments; assert(ctx != NULL && ctx->refcnt > 0 && found != NULL); TRACE(("idn_ucsset_lookup(v=U+%lX)\n", v)); /* Make sure it is fixed. */ if (!ctx->fixed) { WARNING(("idn_ucsset_lookup: not fixed yet\n")); return (idn_failure); } /* Check the given code point. */ if (v >= UCS_MAX) return (idn_invalid_codepoint); /* Get the segment 'v' belongs to. */ segments = ctx->segments; idx = SEG_INDEX(v); /* Do binary search. */ *found = 0; if (segments[idx].range_start >= 0) { int lo = segments[idx].range_start; int hi = segments[idx].range_end; range_t *ranges = ctx->ranges; while (lo <= hi) { int mid = (lo + hi) / 2; if (v < ranges[mid].from) { hi = mid - 1; } else if (v > ranges[mid].to) { lo = mid + 1; } else { *found = 1; break; } } } return (idn_success); } static idn_result_t addrange(idn_ucsset_t ctx, unsigned long from, unsigned long to, char *func_name) { range_t *newbuf; /* Check the given code points. */ if (from > UCS_MAX) { WARNING(("%s: code point out of range (U+%lX)\n", func_name, from)); return (idn_invalid_codepoint); } else if (to > UCS_MAX) { WARNING(("%s: code point out of range (U+%lX)\n", func_name, to)); return (idn_invalid_codepoint); } else if (from > to) { WARNING(("%s: invalid range spec (U+%lX-U+%lX)\n", func_name, from, to)); return (idn_invalid_codepoint); } /* Make sure it is not fixed yet. */ if (ctx->fixed) { WARNING(("%s: attempt to add to already fixed object\n", func_name)); return (idn_failure); } /* Append the specified range to the 'ranges' array. */ if (ctx->nranges >= ctx->size) { /* Make it bigger. */ if (ctx->size == 0) ctx->size = INIT_SIZE; else ctx->size *= 2; newbuf = realloc(ctx->ranges, ctx->size * sizeof(range_t)); if (newbuf == NULL) return (idn_nomemory); ctx->ranges = newbuf; } ctx->ranges[ctx->nranges].from = from; ctx->ranges[ctx->nranges].to = to; ctx->nranges++; return (idn_success); } static int comp_range(const void *v1, const void *v2) { /* * Range comparation function suitable for qsort(). */ const range_t *r1 = v1; const range_t *r2 = v2; if (r1->from < r2->from) return (-1); else if (r1->from > r2->from) return (1); else return (0); }