root/fs/unicode/utf8-norm.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. utf8version_is_supported
  2. utf8version_latest
  3. utf8clen
  4. utf8decode3
  5. utf8encode3
  6. utf8hangul
  7. utf8nlookup
  8. utf8lookup
  9. utf8agemax
  10. utf8agemin
  11. utf8nagemax
  12. utf8nagemin
  13. utf8len
  14. utf8nlen
  15. utf8ncursor
  16. utf8cursor
  17. utf8byte
  18. utf8nfdi
  19. utf8nfdicf

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (c) 2014 SGI.
   4  * All rights reserved.
   5  */
   6 
   7 #include "utf8n.h"
   8 
/*
 * Descriptor for one normalization trie.  The lookup code in this file
 * locates the trie at utf8data + offset and uses maxage to decide which
 * code points are too new to be normalized.
 */
struct utf8data {
        /* Newest age accepted; leaves whose utf8agetab[] age exceeds it are treated as CCC 0. */
        unsigned int maxage;
        /* Offset of the trie root, added to utf8data by utf8nlookup(). */
        unsigned int offset;
};
  13 
  14 #define __INCLUDED_FROM_UTF8NORM_C__
  15 #include "utf8data.h"
  16 #undef __INCLUDED_FROM_UTF8NORM_C__
  17 
  18 int utf8version_is_supported(u8 maj, u8 min, u8 rev)
  19 {
  20         int i = ARRAY_SIZE(utf8agetab) - 1;
  21         unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
  22 
  23         while (i >= 0 && utf8agetab[i] != 0) {
  24                 if (sb_utf8version == utf8agetab[i])
  25                         return 1;
  26                 i--;
  27         }
  28         return 0;
  29 }
  30 EXPORT_SYMBOL(utf8version_is_supported);
  31 
/*
 * Return the latest unicode version known to this implementation, as
 * recorded in utf8vers.
 *
 * NOTE(review): utf8vers is defined outside this file (presumably in
 * utf8data.h, using the same encoding as UNICODE_AGE()) -- confirm.
 */
int utf8version_latest(void)
{
        return utf8vers;
}
EXPORT_SYMBOL(utf8version_latest);
  37 
  38 /*
  39  * UTF-8 valid ranges.
  40  *
  41  * The UTF-8 encoding spreads the bits of a 32bit word over several
  42  * bytes. This table gives the ranges that can be held and how they'd
  43  * be represented.
  44  *
  45  * 0x00000000 0x0000007F: 0xxxxxxx
  46  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
  47  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  48  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  49  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  50  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  51  *
  52  * There is an additional requirement on UTF-8, in that only the
  53  * shortest representation of a 32bit value is to be used.  A decoder
  54  * must not decode sequences that do not satisfy this requirement.
  55  * Thus the allowed ranges have a lower bound.
  56  *
  57  * 0x00000000 0x0000007F: 0xxxxxxx
  58  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
  59  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  60  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  61  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  62  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  63  *
  64  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
  65  * 17 planes of 65536 values.  This limits the sequences actually seen
  66  * even more, to just the following.
  67  *
  68  *          0 -     0x7F: 0                   - 0x7F
  69  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
  70  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
  71  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
  72  *
  73  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
  74  *
  75  * Note that the longest sequence seen with valid usage is 4 bytes,
  76  * the same as a single UTF-32 character.  This makes the UTF-8
  77  * representation of Unicode strictly smaller than UTF-32.
  78  *
  79  * The shortest sequence requirement was introduced by:
  80  *    Corrigendum #1: UTF-8 Shortest Form
  81  * It can be found here:
  82  *    http://www.unicode.org/versions/corrigendum1.html
  83  *
  84  */
  85 
  86 /*
  87  * Return the number of bytes used by the current UTF-8 sequence.
  88  * Assumes the input points to the first byte of a valid UTF-8
  89  * sequence.
  90  */
  91 static inline int utf8clen(const char *s)
  92 {
  93         unsigned char c = *s;
  94 
  95         return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
  96 }
  97 
  98 /*
  99  * Decode a 3-byte UTF-8 sequence.
 100  */
 101 static unsigned int
 102 utf8decode3(const char *str)
 103 {
 104         unsigned int            uc;
 105 
 106         uc = *str++ & 0x0F;
 107         uc <<= 6;
 108         uc |= *str++ & 0x3F;
 109         uc <<= 6;
 110         uc |= *str++ & 0x3F;
 111 
 112         return uc;
 113 }
 114 
 115 /*
 116  * Encode a 3-byte UTF-8 sequence.
 117  */
 118 static int
 119 utf8encode3(char *str, unsigned int val)
 120 {
 121         str[2] = (val & 0x3F) | 0x80;
 122         val >>= 6;
 123         str[1] = (val & 0x3F) | 0x80;
 124         val >>= 6;
 125         str[0] = val | 0xE0;
 126 
 127         return 3;
 128 }
 129 
 130 /*
 131  * utf8trie_t
 132  *
 133  * A compact binary tree, used to decode UTF-8 characters.
 134  *
 135  * Internal nodes are one byte for the node itself, and up to three
 136  * bytes for an offset into the tree.  The first byte contains the
 137  * following information:
 138  *  NEXTBYTE  - flag        - advance to next byte if set
 139  *  BITNUM    - 3 bit field - the bit number to be tested
 140  *  OFFLEN    - 2 bit field - number of bytes in the offset
 141  * if offlen == 0 (non-branching node)
 142  *  RIGHTPATH - 1 bit field - set if the following node is for the
 143  *                            right-hand path (tested bit is set)
 144  *  TRIENODE  - 1 bit field - set if the following node is an internal
 145  *                            node, otherwise it is a leaf node
 146  * if offlen != 0 (branching node)
 147  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 148  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 149  *
 150  * Due to the way utf8 works, there cannot be branching nodes with
 151  * NEXTBYTE set, and moreover those nodes always have a righthand
 152  * descendant.
 153  */
/* Trie nodes are read-only bytes. */
typedef const unsigned char utf8trie_t;
/* Fields of the first byte of an internal node. */
#define BITNUM          0x07    /* number of the input bit to test */
#define NEXTBYTE        0x08    /* advance to the next input byte before testing */
#define OFFLEN          0x30    /* number of offset bytes that follow the node byte */
#define OFFLEN_SHIFT    4       /* shift that moves the OFFLEN field down to bit 0 */
/* Top two bits when OFFLEN is zero (non-branching node). */
#define RIGHTPATH       0x40    /* following node is for the right-hand path */
#define TRIENODE        0x80    /* following node is internal rather than a leaf */
/* Top two bits when OFFLEN is non-zero (branching node). */
#define RIGHTNODE       0x40    /* right-hand node is internal */
#define LEFTNODE        0x80    /* left-hand node is internal */
 163 
 164 /*
 165  * utf8leaf_t
 166  *
 167  * The leaves of the trie are embedded in the trie, and so the same
 168  * underlying datatype: unsigned char.
 169  *
 170  * leaf[0]: The unicode version, stored as a generation number that is
 171  *          an index into utf8agetab[].  With this we can filter code
 172  *          points based on the unicode version in which they were
 173  *          defined.  The CCC of a non-defined code point is 0.
 174  * leaf[1]: Canonical Combining Class. During normalization, we need
 175  *          to do a stable sort into ascending order of all characters
 176  *          with a non-zero CCC that occur between two characters with
 177  *          a CCC of 0, or at the begin or end of a string.
 178  *          The unicode standard guarantees that all CCC values are
 179  *          between 0 and 254 inclusive, which leaves 255 available as
 180  *          a special value.
 181  *          Code points with CCC 0 are known as stoppers.
 182  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 183  *          start of a NUL-terminated string that is the decomposition
 184  *          of the character.
 185  *          The CCC of a decomposable character is the same as the CCC
 186  *          of the first character of its decomposition.
 187  *          Some characters decompose as the empty string: these are
 188  *          characters with the Default_Ignorable_Code_Point property.
 189  *          These do affect normalization, as they all have CCC 0.
 190  *
 191  * The decompositions in the trie have been fully expanded, with the
 192  * exception of Hangul syllables, which are decomposed algorithmically.
 193  *
 194  * Casefolding, if applicable, is also done using decompositions.
 195  *
 196  * The trie is constructed in such a way that leaves exist for all
 197  * UTF-8 sequences that match the criteria from the "UTF-8 valid
 198  * ranges" comment above, and only for those sequences.  Therefore a
 199  * lookup in the trie can be used to validate the UTF-8 input.
 200  */
/* Leaves share the trie's storage, so they are read-only bytes as well. */
typedef const unsigned char utf8leaf_t;

/* Accessors for the leaf layout: generation, CCC, then the decomposition. */
#define LEAF_GEN(LEAF)  ((LEAF)[0])
#define LEAF_CCC(LEAF)  ((LEAF)[1])
#define LEAF_STR(LEAF)  ((const char *)((LEAF) + 2))

/*
 * Canonical Combining Class values.  Real classes lie in MINCCC..MAXCCC;
 * STOPPER is the class 0 that ends a run of combining characters, and
 * DECOMPOSE is the out-of-range value stored in LEAF_CCC() to signal
 * that LEAF_STR() holds a decomposition.
 */
#define MINCCC          (0)
#define MAXCCC          (254)
#define STOPPER         (0)
#define DECOMPOSE       (255)

/*
 * Marker for hangul syllable decomposition: utf8nlookup() checks for it
 * in the first byte of LEAF_STR() and then calls utf8hangul().
 */
#define HANGUL          ((char)(255))
/*
 * Size of the synthesized leaf used for Hangul syllable decomposition:
 * 2 header bytes, up to three 3-byte UTF-8 sequences, and a NUL.
 */
#define UTF8HANGULLEAF  (12)
 216 
 217 /*
 218  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 219  *
 220  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 221  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 222  *
 223  * SBase = 0xAC00
 224  * LBase = 0x1100
 225  * VBase = 0x1161
 226  * TBase = 0x11A7
 227  * LCount = 19
 228  * VCount = 21
 229  * TCount = 28
 230  * NCount = 588 (VCount * TCount)
 231  * SCount = 11172 (LCount * NCount)
 232  *
 233  * Decomposition:
 234  *   SIndex = s - SBase
 235  *
 236  * LV (Canonical/Full)
 237  *   LIndex = SIndex / NCount
 238  *   VIndex = (Sindex % NCount) / TCount
 239  *   LPart = LBase + LIndex
 240  *   VPart = VBase + VIndex
 241  *
 242  * LVT (Canonical)
 243  *   LVIndex = (SIndex / TCount) * TCount
 244  *   TIndex = (Sindex % TCount)
 245  *   LVPart = SBase + LVIndex
 246  *   TPart = TBase + TIndex
 247  *
 248  * LVT (Full)
 249  *   LIndex = SIndex / NCount
 250  *   VIndex = (Sindex % NCount) / TCount
 251  *   TIndex = (Sindex % TCount)
 252  *   LPart = LBase + LIndex
 253  *   VPart = VBase + VIndex
 254  *   if (TIndex == 0) {
 255  *          d = <LPart, VPart>
 256  *   } else {
 257  *          TPart = TBase + TIndex
 258  *          d = <LPart, VPart, TPart>
 259  *   }
 260  */
 261 
/* Constants for the algorithmic Hangul decomposition in utf8hangul(). */
#define SB      (0xAC00)        /* SBase: first Hangul syllable */
#define LB      (0x1100)        /* LBase: added to LIndex to form LPart */
#define VB      (0x1161)        /* VBase: added to VIndex to form VPart */
#define TB      (0x11A7)        /* TBase: added to TIndex to form TPart */
#define LC      (19)            /* LCount */
#define VC      (21)            /* VCount */
#define TC      (28)            /* TCount */
#define NC      (VC * TC)       /* NCount = 588 */
#define SC      (LC * NC)       /* SCount = 11172 */
 272 
 273 /* Algorithmic decomposition of hangul syllable. */
 274 static utf8leaf_t *
 275 utf8hangul(const char *str, unsigned char *hangul)
 276 {
 277         unsigned int    si;
 278         unsigned int    li;
 279         unsigned int    vi;
 280         unsigned int    ti;
 281         unsigned char   *h;
 282 
 283         /* Calculate the SI, LI, VI, and TI values. */
 284         si = utf8decode3(str) - SB;
 285         li = si / NC;
 286         vi = (si % NC) / TC;
 287         ti = si % TC;
 288 
 289         /* Fill in base of leaf. */
 290         h = hangul;
 291         LEAF_GEN(h) = 2;
 292         LEAF_CCC(h) = DECOMPOSE;
 293         h += 2;
 294 
 295         /* Add LPart, a 3-byte UTF-8 sequence. */
 296         h += utf8encode3((char *)h, li + LB);
 297 
 298         /* Add VPart, a 3-byte UTF-8 sequence. */
 299         h += utf8encode3((char *)h, vi + VB);
 300 
 301         /* Add TPart if required, also a 3-byte UTF-8 sequence. */
 302         if (ti)
 303                 h += utf8encode3((char *)h, ti + TB);
 304 
 305         /* Terminate string. */
 306         h[0] = '\0';
 307 
 308         return hangul;
 309 }
 310 
 311 /*
 312  * Use trie to scan s, touching at most len bytes.
 313  * Returns the leaf if one exists, NULL otherwise.
 314  *
 315  * A non-NULL return guarantees that the UTF-8 sequence starting at s
 316  * is well-formed and corresponds to a known unicode code point.  The
 317  * shorthand for this will be "is valid UTF-8 unicode".
 318  */
 319 static utf8leaf_t *utf8nlookup(const struct utf8data *data,
 320                                unsigned char *hangul, const char *s, size_t len)
 321 {
 322         utf8trie_t      *trie = NULL;
 323         int             offlen;
 324         int             offset;
 325         int             mask;
 326         int             node;
 327 
 328         if (!data)
 329                 return NULL;
 330         if (len == 0)
 331                 return NULL;
 332 
 333         trie = utf8data + data->offset;
 334         node = 1;
 335         while (node) {
 336                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
 337                 if (*trie & NEXTBYTE) {
 338                         if (--len == 0)
 339                                 return NULL;
 340                         s++;
 341                 }
 342                 mask = 1 << (*trie & BITNUM);
 343                 if (*s & mask) {
 344                         /* Right leg */
 345                         if (offlen) {
 346                                 /* Right node at offset of trie */
 347                                 node = (*trie & RIGHTNODE);
 348                                 offset = trie[offlen];
 349                                 while (--offlen) {
 350                                         offset <<= 8;
 351                                         offset |= trie[offlen];
 352                                 }
 353                                 trie += offset;
 354                         } else if (*trie & RIGHTPATH) {
 355                                 /* Right node after this node */
 356                                 node = (*trie & TRIENODE);
 357                                 trie++;
 358                         } else {
 359                                 /* No right node. */
 360                                 return NULL;
 361                         }
 362                 } else {
 363                         /* Left leg */
 364                         if (offlen) {
 365                                 /* Left node after this node. */
 366                                 node = (*trie & LEFTNODE);
 367                                 trie += offlen + 1;
 368                         } else if (*trie & RIGHTPATH) {
 369                                 /* No left node. */
 370                                 return NULL;
 371                         } else {
 372                                 /* Left node after this node */
 373                                 node = (*trie & TRIENODE);
 374                                 trie++;
 375                         }
 376                 }
 377         }
 378         /*
 379          * Hangul decomposition is done algorithmically. These are the
 380          * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
 381          * always 3 bytes long, so s has been advanced twice, and the
 382          * start of the sequence is at s-2.
 383          */
 384         if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
 385                 trie = utf8hangul(s - 2, hangul);
 386         return trie;
 387 }
 388 
/*
 * Use trie to scan s without a caller-supplied length limit.
 * Returns the leaf if one exists, NULL otherwise.
 *
 *   data   : selects the trie to use.
 *   hangul : scratch buffer of UTF8HANGULLEAF bytes for hangul leaves.
 *   s      : start of the sequence to look up; the callers in this file
 *            pass NUL-terminated strings.
 *
 * Forwards to utf8nlookup().
 */
static utf8leaf_t *utf8lookup(const struct utf8data *data,
                              unsigned char *hangul, const char *s)
{
        /* (size_t)-1 is the largest size_t, i.e. effectively no limit. */
        return utf8nlookup(data, hangul, s, (size_t)-1);
}
 400 
 401 /*
 402  * Maximum age of any character in s.
 403  * Return -1 if s is not valid UTF-8 unicode.
 404  * Return 0 if only non-assigned code points are used.
 405  */
 406 int utf8agemax(const struct utf8data *data, const char *s)
 407 {
 408         utf8leaf_t      *leaf;
 409         int             age = 0;
 410         int             leaf_age;
 411         unsigned char   hangul[UTF8HANGULLEAF];
 412 
 413         if (!data)
 414                 return -1;
 415 
 416         while (*s) {
 417                 leaf = utf8lookup(data, hangul, s);
 418                 if (!leaf)
 419                         return -1;
 420 
 421                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 422                 if (leaf_age <= data->maxage && leaf_age > age)
 423                         age = leaf_age;
 424                 s += utf8clen(s);
 425         }
 426         return age;
 427 }
 428 EXPORT_SYMBOL(utf8agemax);
 429 
 430 /*
 431  * Minimum age of any character in s.
 432  * Return -1 if s is not valid UTF-8 unicode.
 433  * Return 0 if non-assigned code points are used.
 434  */
 435 int utf8agemin(const struct utf8data *data, const char *s)
 436 {
 437         utf8leaf_t      *leaf;
 438         int             age;
 439         int             leaf_age;
 440         unsigned char   hangul[UTF8HANGULLEAF];
 441 
 442         if (!data)
 443                 return -1;
 444         age = data->maxage;
 445         while (*s) {
 446                 leaf = utf8lookup(data, hangul, s);
 447                 if (!leaf)
 448                         return -1;
 449                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 450                 if (leaf_age <= data->maxage && leaf_age < age)
 451                         age = leaf_age;
 452                 s += utf8clen(s);
 453         }
 454         return age;
 455 }
 456 EXPORT_SYMBOL(utf8agemin);
 457 
 458 /*
 459  * Maximum age of any character in s, touch at most len bytes.
 460  * Return -1 if s is not valid UTF-8 unicode.
 461  */
 462 int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
 463 {
 464         utf8leaf_t      *leaf;
 465         int             age = 0;
 466         int             leaf_age;
 467         unsigned char   hangul[UTF8HANGULLEAF];
 468 
 469         if (!data)
 470                 return -1;
 471 
 472         while (len && *s) {
 473                 leaf = utf8nlookup(data, hangul, s, len);
 474                 if (!leaf)
 475                         return -1;
 476                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 477                 if (leaf_age <= data->maxage && leaf_age > age)
 478                         age = leaf_age;
 479                 len -= utf8clen(s);
 480                 s += utf8clen(s);
 481         }
 482         return age;
 483 }
 484 EXPORT_SYMBOL(utf8nagemax);
 485 
 486 /*
 487  * Maximum age of any character in s, touch at most len bytes.
 488  * Return -1 if s is not valid UTF-8 unicode.
 489  */
 490 int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
 491 {
 492         utf8leaf_t      *leaf;
 493         int             leaf_age;
 494         int             age;
 495         unsigned char   hangul[UTF8HANGULLEAF];
 496 
 497         if (!data)
 498                 return -1;
 499         age = data->maxage;
 500         while (len && *s) {
 501                 leaf = utf8nlookup(data, hangul, s, len);
 502                 if (!leaf)
 503                         return -1;
 504                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 505                 if (leaf_age <= data->maxage && leaf_age < age)
 506                         age = leaf_age;
 507                 len -= utf8clen(s);
 508                 s += utf8clen(s);
 509         }
 510         return age;
 511 }
 512 EXPORT_SYMBOL(utf8nagemin);
 513 
 514 /*
 515  * Length of the normalization of s.
 516  * Return -1 if s is not valid UTF-8 unicode.
 517  *
 518  * A string of Default_Ignorable_Code_Point has length 0.
 519  */
 520 ssize_t utf8len(const struct utf8data *data, const char *s)
 521 {
 522         utf8leaf_t      *leaf;
 523         size_t          ret = 0;
 524         unsigned char   hangul[UTF8HANGULLEAF];
 525 
 526         if (!data)
 527                 return -1;
 528         while (*s) {
 529                 leaf = utf8lookup(data, hangul, s);
 530                 if (!leaf)
 531                         return -1;
 532                 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 533                         ret += utf8clen(s);
 534                 else if (LEAF_CCC(leaf) == DECOMPOSE)
 535                         ret += strlen(LEAF_STR(leaf));
 536                 else
 537                         ret += utf8clen(s);
 538                 s += utf8clen(s);
 539         }
 540         return ret;
 541 }
 542 EXPORT_SYMBOL(utf8len);
 543 
 544 /*
 545  * Length of the normalization of s, touch at most len bytes.
 546  * Return -1 if s is not valid UTF-8 unicode.
 547  */
 548 ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
 549 {
 550         utf8leaf_t      *leaf;
 551         size_t          ret = 0;
 552         unsigned char   hangul[UTF8HANGULLEAF];
 553 
 554         if (!data)
 555                 return -1;
 556         while (len && *s) {
 557                 leaf = utf8nlookup(data, hangul, s, len);
 558                 if (!leaf)
 559                         return -1;
 560                 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 561                         ret += utf8clen(s);
 562                 else if (LEAF_CCC(leaf) == DECOMPOSE)
 563                         ret += strlen(LEAF_STR(leaf));
 564                 else
 565                         ret += utf8clen(s);
 566                 len -= utf8clen(s);
 567                 s += utf8clen(s);
 568         }
 569         return ret;
 570 }
 571 EXPORT_SYMBOL(utf8nlen);
 572 
 573 /*
 574  * Set up an utf8cursor for use by utf8byte().
 575  *
 576  *   u8c    : pointer to cursor.
 577  *   data   : const struct utf8data to use for normalization.
 578  *   s      : string.
 579  *   len    : length of s.
 580  *
 581  * Returns -1 on error, 0 on success.
 582  */
 583 int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
 584                 const char *s, size_t len)
 585 {
 586         if (!data)
 587                 return -1;
 588         if (!s)
 589                 return -1;
 590         u8c->data = data;
 591         u8c->s = s;
 592         u8c->p = NULL;
 593         u8c->ss = NULL;
 594         u8c->sp = NULL;
 595         u8c->len = len;
 596         u8c->slen = 0;
 597         u8c->ccc = STOPPER;
 598         u8c->nccc = STOPPER;
 599         /* Check we didn't clobber the maximum length. */
 600         if (u8c->len != len)
 601                 return -1;
 602         /* The first byte of s may not be an utf8 continuation. */
 603         if (len > 0 && (*s & 0xC0) == 0x80)
 604                 return -1;
 605         return 0;
 606 }
 607 EXPORT_SYMBOL(utf8ncursor);
 608 
/*
 * Set up an utf8cursor for use by utf8byte().
 *
 *   u8c    : pointer to cursor.
 *   data   : const struct utf8data to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with the largest unsigned int as the
 * length, so that the scan is ended by the terminating NUL.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
               const char *s)
{
        /*
         * NOTE(review): the cast is to unsigned int rather than size_t,
         * presumably because u8c->len is an unsigned int and
         * utf8ncursor() rejects lengths that do not fit -- confirm
         * against the struct utf8cursor declaration.
         */
        return utf8ncursor(u8c, data, s, (unsigned int)-1);
}
EXPORT_SYMBOL(utf8cursor);
 624 
/*
 * Get one byte from the normalized form of the string described by u8c.
 *
 *   u8c    : cursor set up by utf8cursor() or utf8ncursor().
 *
 * Returns the byte cast to an unsigned char on success, 0 when the end
 * of the string is reached, and -1 on failure (no leaf was found for
 * the current character).
 *
 * The cursor keeps track of the location in the string in u8c->s.
 * When a character is decomposed, the current location is stored in
 * u8c->p, and u8c->s is set to the start of the decomposition. Note
 * that bytes from a decomposition do not count against u8c->len.
 *
 * Characters are emitted if they match the current CCC in u8c->ccc.
 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
 * and the function returns 0 in that case.
 *
 * Sorting by CCC is done by repeatedly scanning the string.  The
 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
 * the start of the scan.  The first pass finds the lowest CCC to be
 * emitted and stores it in u8c->nccc, the second pass emits the
 * characters with this CCC and finds the next lowest CCC. This limits
 * the number of passes to 1 + the number of different CCCs in the
 * sequence being scanned.
 *
 * Therefore:
 *  u8c->p  != NULL -> a decomposition is being scanned.
 *  u8c->ss != NULL -> this is a repeating scan.
 *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
 */
int utf8byte(struct utf8cursor *u8c)
{
        utf8leaf_t *leaf;
        int ccc;

        for (;;) {
                /*
                 * Check for the end of a decomposed character: resume
                 * in the original string at the saved position.
                 */
                if (u8c->p && *u8c->s == '\0') {
                        u8c->s = u8c->p;
                        u8c->p = NULL;
                }

                /* Check for end-of-string. */
                if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
                        /* There is no next byte. */
                        if (u8c->ccc == STOPPER)
                                return 0;
                        /* End-of-string during a scan counts as a stopper. */
                        ccc = STOPPER;
                        goto ccc_mismatch;
                } else if ((*u8c->s & 0xC0) == 0x80) {
                        /*
                         * This is a continuation of the current character.
                         * Only bytes taken from the original string are
                         * charged against u8c->len.
                         */
                        if (!u8c->p)
                                u8c->len--;
                        return (unsigned char)*u8c->s++;
                }

                /*
                 * Look up the data for the current character.  Inside a
                 * decomposition the string is NUL-terminated, so the
                 * unbounded lookup is used; otherwise the lookup is
                 * limited to the remaining u8c->len bytes.
                 */
                if (u8c->p) {
                        leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
                } else {
                        leaf = utf8nlookup(u8c->data, u8c->hangul,
                                           u8c->s, u8c->len);
                }

                /* No leaf found implies that the input is a binary blob. */
                if (!leaf)
                        return -1;

                ccc = LEAF_CCC(leaf);
                /* Characters that are too new have CCC 0. */
                if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
                        ccc = STOPPER;
                } else if (ccc == DECOMPOSE) {
                        /*
                         * Consume the whole character from the original
                         * string, remember where to resume in u8c->p, and
                         * continue in the decomposition string.
                         */
                        u8c->len -= utf8clen(u8c->s);
                        u8c->p = u8c->s + utf8clen(u8c->s);
                        u8c->s = LEAF_STR(leaf);
                        /* Empty decomposition implies CCC 0. */
                        if (*u8c->s == '\0') {
                                if (u8c->ccc == STOPPER)
                                        continue;
                                ccc = STOPPER;
                                goto ccc_mismatch;
                        }

                        /* Use the CCC of the decomposition's first character. */
                        leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
                        if (!leaf)
                                return -1;
                        ccc = LEAF_CCC(leaf);
                }

                /*
                 * If this is not a stopper, then see if it updates
                 * the next canonical class to be emitted.
                 */
                if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
                        u8c->nccc = ccc;

                /*
                 * Return the current byte if this is the current
                 * combining class.
                 */
                if (ccc == u8c->ccc) {
                        if (!u8c->p)
                                u8c->len--;
                        return (unsigned char)*u8c->s++;
                }

                /* Current combining class mismatch. */
ccc_mismatch:
                if (u8c->nccc == STOPPER) {
                        /*
                         * Scan forward for the first canonical class
                         * to be emitted.  Save the position from
                         * which to restart.  MINCCC - 1 is the -1 that
                         * marks the first scan of a repeating scan.
                         */
                        u8c->ccc = MINCCC - 1;
                        u8c->nccc = ccc;
                        u8c->sp = u8c->p;
                        u8c->ss = u8c->s;
                        u8c->slen = u8c->len;
                        if (!u8c->p)
                                u8c->len -= utf8clen(u8c->s);
                        u8c->s += utf8clen(u8c->s);
                } else if (ccc != STOPPER) {
                        /* Not a stopper, and not the ccc we're emitting. */
                        if (!u8c->p)
                                u8c->len -= utf8clen(u8c->s);
                        u8c->s += utf8clen(u8c->s);
                } else if (u8c->nccc != MAXCCC + 1) {
                        /*
                         * At a stopper, restart for next ccc.  MAXCCC + 1
                         * lies above every real class, so any class found
                         * during the next pass replaces it.
                         */
                        u8c->ccc = u8c->nccc;
                        u8c->nccc = MAXCCC + 1;
                        u8c->s = u8c->ss;
                        u8c->p = u8c->sp;
                        u8c->len = u8c->slen;
                } else {
                        /* All done, proceed from here. */
                        u8c->ccc = STOPPER;
                        u8c->nccc = STOPPER;
                        u8c->sp = NULL;
                        u8c->ss = NULL;
                        u8c->slen = 0;
                }
        }
}
EXPORT_SYMBOL(utf8byte);
 769 
 770 const struct utf8data *utf8nfdi(unsigned int maxage)
 771 {
 772         int i = ARRAY_SIZE(utf8nfdidata) - 1;
 773 
 774         while (maxage < utf8nfdidata[i].maxage)
 775                 i--;
 776         if (maxage > utf8nfdidata[i].maxage)
 777                 return NULL;
 778         return &utf8nfdidata[i];
 779 }
 780 EXPORT_SYMBOL(utf8nfdi);
 781 
 782 const struct utf8data *utf8nfdicf(unsigned int maxage)
 783 {
 784         int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
 785 
 786         while (maxage < utf8nfdicfdata[i].maxage)
 787                 i--;
 788         if (maxage > utf8nfdicfdata[i].maxage)
 789                 return NULL;
 790         return &utf8nfdicfdata[i];
 791 }
 792 EXPORT_SYMBOL(utf8nfdicf);

/* [<][>][^][v][top][bottom][index][help] */