LCOV - coverage.info - lib/unicode.c

LCOV - code coverage report

Current view:	top level - lib - unicode.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	37	42	88.1 %
Date:	2016-09-14 01:02:56	Functions:	4	5	80.0 %
		Branches:	21	26	80.8 %

           Branch data     Line data    Source code

       1                 :            : /*
       2                 :            :  * Copyright (c) 2009, 2010 Nicira, Inc.
       3                 :            :  *
       4                 :            :  * Licensed under the Apache License, Version 2.0 (the "License");
       5                 :            :  * you may not use this file except in compliance with the License.
       6                 :            :  * You may obtain a copy of the License at:
       7                 :            :  *
       8                 :            :  *     http://www.apache.org/licenses/LICENSE-2.0
       9                 :            :  *
      10                 :            :  * Unless required by applicable law or agreed to in writing, software
      11                 :            :  * distributed under the License is distributed on an "AS IS" BASIS,
      12                 :            :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      13                 :            :  * See the License for the specific language governing permissions and
      14                 :            :  * limitations under the License.
      15                 :            :  */
      16                 :            : 
      17                 :            : #include <config.h>
      18                 :            : 
      19                 :            : #include "unicode.h"
      20                 :            : 
      21                 :            : #include <inttypes.h>
      22                 :            : 
      23                 :            : #include "openvswitch/dynamic-string.h"
      24                 :            : #include "util.h"
      25                 :            : 
      26                 :            : /* Returns the unicode code point corresponding to leading surrogate 'leading'
      27                 :            :  * and trailing surrogate 'trailing'.  The return value will not make any
      28                 :            :  * sense if 'leading' or 'trailing' are not in the correct ranges for leading
      29                 :            :  * or trailing surrogates. */
      30                 :            : int
      31                 :          2 : utf16_decode_surrogate_pair(int leading, int trailing)
      32                 :            : {
      33                 :            :     /*
      34                 :            :      *  Leading surrogate:         110110wwwwxxxxxx
      35                 :            :      * Trailing surrogate:         110111xxxxxxxxxx
      36                 :            :      *         Code point: 000uuuuuxxxxxxxxxxxxxxxx
      37                 :            :      */
      38                 :          2 :     int w = (leading >> 6) & 0xf;
      39                 :          2 :     int u = w + 1;
      40                 :          2 :     int x0 = leading & 0x3f;
      41                 :          2 :     int x1 = trailing & 0x3ff;
      42                 :          2 :     return (u << 16) | (x0 << 10) | x1;
      43                 :            : }
      44                 :            : 
      45                 :            : /* Returns the number of Unicode characters in UTF-8 string 's'. */
      46                 :            : size_t
      47                 :          0 : utf8_length(const char *s_)
      48                 :            : {
      49                 :            :     const uint8_t *s;
      50                 :            :     size_t length;
      51                 :            : 
      52                 :          0 :     length = 0;
      53         [ #  # ]:          0 :     for (s = (const uint8_t *) s_; *s != '\0'; s++) {
      54                 :            :         /* The most-significant bits of the first byte in a character are one
      55                 :            :          * of 2#01, 2#00, or 2#11.  2#10 is a continuation byte. */
      56                 :          0 :         length += (*s & 0xc0) != 0x80;
      57                 :            :     }
      58                 :          0 :     return length;
      59                 :            : }
      60                 :            : 
      61                 :            : static char *
      62                 :          2 : invalid_utf8_sequence(const uint8_t *s, int n, size_t *lengthp)
      63                 :            : {
      64                 :            :     struct ds msg;
      65                 :            :     int i;
      66                 :            : 
      67         [ +  - ]:          2 :     if (lengthp) {
      68                 :          2 :         *lengthp = 0;
      69                 :            :     }
      70                 :            : 
      71                 :          2 :     ds_init(&msg);
      72                 :          2 :     ds_put_cstr(&msg, "invalid UTF-8 sequence");
      73         [ +  + ]:          5 :     for (i = 0; i < n; i++) {
      74                 :          3 :         ds_put_format(&msg, " 0x%02"PRIx8, s[i]);
      75                 :            :     }
      76                 :          2 :     return ds_steal_cstr(&msg);
      77                 :            : }
      78                 :            : 
      79                 :            : struct utf8_sequence {
      80                 :            :     uint8_t octets[5][2];
      81                 :            : };
      82                 :            : 
      83                 :            : static const struct utf8_sequence *
      84                 :          3 : lookup_utf8_sequence(uint8_t c)
      85                 :            : {
      86                 :            :     static const struct utf8_sequence seqs[] = {
      87                 :            :         { { { 0x01, 0x7f },
      88                 :            :             { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } },
      89                 :            : 
      90                 :            :         { { { 0xc2, 0xdf }, { 0x80, 0xbf },
      91                 :            :             { 0, 0 }, { 0, 0 }, { 0, 0 } } },
      92                 :            : 
      93                 :            :         { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf },
      94                 :            :             {0,0}, {0, 0 } } },
      95                 :            : 
      96                 :            :         { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf },
      97                 :            :             { 0, 0 }, { 0, 0 } } },
      98                 :            : 
      99                 :            :         { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf },
     100                 :            :             { 0, 0 }, { 0, 0 } } },
     101                 :            : 
     102                 :            :         { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf },
     103                 :            :             { 0, 0 }, { 0, 0 } } },
     104                 :            : 
     105                 :            :         { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
     106                 :            :             { 0, 0 } } },
     107                 :            : 
     108                 :            :         { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
     109                 :            :             { 0, 0 } } },
     110                 :            : 
     111                 :            :         { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf },
     112                 :            :             { 0, 0 } } },
     113                 :            :     };
     114                 :            : 
     115                 :            :     size_t i;
     116                 :            : 
     117         [ +  + ]:         22 :     for (i = 0; i < ARRAY_SIZE(seqs); i++) {
     118                 :         21 :         const uint8_t *o = seqs[i].octets[0];
     119 [ +  + ][ +  + ]:         21 :         if (c >= o[0] && c <= o[1]) {
     120                 :          2 :             return &seqs[i];
     121                 :            :         }
     122                 :            :     }
     123                 :          1 :     return NULL;
     124                 :            : }
     125                 :            : 
     126                 :            : /* Checks that 's' is a valid, null-terminated UTF-8 string.  If so, returns a
     127                 :            :  * null pointer and sets '*lengthp' to the number of Unicode characters in
     128                 :            :  * 's'.  If not, returns an error message that the caller must free and sets
     129                 :            :  * '*lengthp' to 0.
     130                 :            :  *
     131                 :            :  * 'lengthp' may be NULL if the length is not needed. */
     132                 :            : char *
     133                 :     647315 : utf8_validate(const char *s_, size_t *lengthp)
     134                 :            : {
     135                 :     647315 :     size_t length = 0;
     136                 :            :     const uint8_t *s;
     137                 :            : 
     138         [ +  + ]:    6600919 :     for (s = (const uint8_t *) s_; *s != '\0'; ) {
     139                 :    5953606 :         length++;
     140         [ +  + ]:    5953606 :         if (s[0] < 0x80) {
     141                 :    5953603 :             s++;
     142                 :            :         } else {
     143                 :            :             const struct utf8_sequence *seq;
     144                 :            :             int i;
     145                 :            : 
     146                 :          3 :             seq = lookup_utf8_sequence(s[0]);
     147         [ +  + ]:          3 :             if (!seq) {
     148                 :          1 :                 return invalid_utf8_sequence(s, 1, lengthp);
     149                 :            :             }
     150                 :            : 
     151         [ +  + ]:          5 :             for (i = 1; seq->octets[i][0]; i++) {
     152                 :          4 :                 const uint8_t *o = seq->octets[i];
     153 [ +  - ][ +  + ]:          4 :                 if (s[i] < o[0] || s[i] > o[1]) {
     154                 :          1 :                     return invalid_utf8_sequence(s, i + 1, lengthp);
     155                 :            :                 }
     156                 :            :             }
     157                 :          1 :             s += i;
     158                 :            :         }
     159                 :            :     }
     160         [ +  - ]:     647313 :     if (lengthp) {
     161                 :     647313 :         *lengthp = length;
     162                 :            :     }
     163                 :     647313 :     return NULL;
     164                 :            : }

Generated by: LCOV version 1.12