+ if ((c & 0xc0) != 0x80)
+ /* wrong continuation byte; invalidate all bytes */
+ complete = 1; /* error */
+ else
+ {
+ codepoint = (codepoint << 6) | (c & 0x3f);
+ seq_buff[index++] = c;
+ if (--bytes_left == 0) /* codepoint complete */
+ if(codepoint > 0x10FFFF) /* is it too large? */
+ complete = -1; /* error (RFC3629 limit) */
+ else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */
+ /* A UTF-16 surrogate (which should be one of a pair that
+ encode a Unicode codepoint that is outside the Basic
+ Multilingual Plane). Error, not UTF8.
+ RFC2279.2 is slightly unclear on this, but
+ https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder
+ says "Surrogates characters are also invalid in UTF-8:
+ characters in U+D800—U+DFFF have to be rejected." */
+ complete = -1;
+ else
+ { /* finished; output utf-8 sequence */
+ yield = string_catn(yield, seq_buff, seq_len);
+ index = 0;
+ }
+ }