/* Copyright 2009 Daniel Martin */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
/* You may obtain a copy of the License at */
/* */
/* http://www.apache.org/licenses/LICENSE-2.0 */
/* */
/* Unless required by applicable law or agreed to in writing, software */
/* distributed under the License is distributed on an "AS IS" BASIS, */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
/* See the License for the specific language governing permissions and */
/* limitations under the License. */

#include "utf8checker.h"
/* NOTE(review): the header name on this include was missing; <stdint.h>
 * supplies the uint8_t/uint32_t types used below -- confirm against the
 * original file. */
#include <stdint.h>

/*
 * Reset a decoder state to "nothing seen yet": no pending sequence and
 * both the byte and character counters at zero.
 *
 * state is dereferenced unconditionally, so it must point to a valid
 * object.  The same pointer is returned for call chaining.
 */
utf8_decoder_state init_utf8_decoder_state(utf8_decoder_state state)
{
    state->working_len = 0;
    state->expecting_continuation = 0;
    state->bytes_so_far = 0;
    state->unichars_so_far = 0;
    return state;
}

/*
 * Close out the sequence held in state->working: count its bytes in
 * bytes_so_far and return to the "between characters" condition.
 */
static void finish_sequence(utf8_decoder_state state)
{
    state->bytes_so_far += state->working_len;
    state->working_len = 0;
    state->expecting_continuation = 0;
}

/*
 * Return non-zero when a complete multi-byte sequence encodes a value
 * that would have fit in a shorter form.  Only the lead byte and the
 * first continuation byte are needed to decide this.
 */
static int is_overlong(const uint8_t *seq)
{
    if ((seq[0] & 0xFE) == 0xC0) {
        return 1;               /* 0xC0/0xC1: two bytes for a 7-bit value */
    }
    if (seq[0] == 0xE0 && (seq[1] & 0x20) == 0x00) {
        return 1;               /* three bytes, value below 0x800 */
    }
    if (seq[0] == 0xF0 && (seq[1] & 0x30) == 0x00) {
        return 1;               /* four bytes, value below 0x10000 */
    }
    if (seq[0] == 0xF8 && (seq[1] & 0x38) == 0x00) {
        return 1;               /* five-byte form, overlong */
    }
    if (seq[0] == 0xFC && (seq[1] & 0x3C) == 0x00) {
        return 1;               /* six-byte form, overlong */
    }
    return 0;
}

/*
 * Assemble the code point carried by a complete sequence of len bytes.
 * Five- and six-byte lead bytes seed the accumulator with 1 so that the
 * shifted result always exceeds 0x10FFFF and is rejected by the caller
 * as OUT_OF_RANGE.
 */
static uint32_t assemble_code_point(const uint8_t *seq, int len)
{
    uint32_t value;
    int i;

    if ((seq[0] & 0xE0) == 0xC0) {
        value = seq[0] & 0x1F;
    } else if ((seq[0] & 0xF0) == 0xE0) {
        value = seq[0] & 0x0F;
    } else if ((seq[0] & 0xF8) == 0xF0) {
        value = seq[0] & 0x07;
    } else {
        value = 1;              /* will be OUT_OF_RANGE */
    }

    for (i = 1; i < len; i++) {
        value <<= 6;
        value += seq[i] & 0x3F;
    }
    return value;
}

/*
 * Feed one byte to the streaming UTF-8 decoder.
 *
 *   incoming  the next byte of input
 *   state     decoder state; updated in place
 *   unih      called once for every successfully decoded character
 *   errh      called once for every malformed sequence
 *
 * Both handlers receive, in order: the code point (unih) or error code
 * (errh), the byte offset at which the sequence started, the number of
 * characters decoded before it, a pointer to the raw bytes of the
 * sequence and the count of those bytes.
 *
 * Errors reported: MISSING_CONTINUATION, OVERLONG_FORM, OUT_OF_RANGE,
 * BAD_SCALAR_VALUE, UNEXPECTED_CONTINUATION and INVALID.
 */
void process_byte(uint8_t incoming, utf8_decoder_state state,
                  unicode_handler unih, error_handler errh)
{
    int err;
    uint32_t unichar;

    if (state->expecting_continuation > 0) {
        if ((incoming & 0xC0) != 0x80) {
            /* Sequence cut short: report what was collected, then treat
             * the offending byte as the start of something new. */
            (*errh)(MISSING_CONTINUATION, state->bytes_so_far,
                    state->unichars_so_far, state->working,
                    state->working_len);
            finish_sequence(state);
            process_byte(incoming, state, unih, errh);
            return;
        }

        /* Got the continuation byte we were waiting for. */
        state->working[state->working_len] = incoming;
        state->working_len++;
        state->expecting_continuation--;
        if (state->expecting_continuation != 0) {
            return;             /* sequence still incomplete */
        }

        /* The sequence is complete: validate it and hand it off. */
        if (is_overlong(state->working)) {
            (*errh)(OVERLONG_FORM, state->bytes_so_far,
                    state->unichars_so_far, state->working,
                    state->working_len);
            finish_sequence(state);
            return;
        }

        unichar = assemble_code_point(state->working, state->working_len);

        err = 0;
        if (unichar > 0x10FFFF) {
            err = OUT_OF_RANGE;
        } else if (unichar >= 0xD800 && unichar <= 0xDFFF) {
            /* Surrogates only.  A bare mask with 0xF800 would also hit
             * valid supplementary code points such as U+1D800. */
            err = BAD_SCALAR_VALUE;
        }

        if (err != 0) {
            (*errh)(err, state->bytes_so_far, state->unichars_so_far,
                    state->working, state->working_len);
        } else {
            /* The handler sees the count before this character. */
            (*unih)(unichar, state->bytes_so_far, state->unichars_so_far++,
                    state->working, state->working_len);
        }
        finish_sequence(state);
        return;
    }

    /* Not inside a multi-byte sequence. */
    if ((incoming & 0x80) == 0) {
        /* ASCII */
        state->working[0] = incoming;
        unih(incoming, state->bytes_so_far, state->unichars_so_far++,
             state->working, 1);
        state->bytes_so_far++;
    } else if (((incoming & 0xC0) == 0x80) || ((incoming & 0xFE) == 0xFE)) {
        /* A continuation byte with nothing to continue, or 0xFE/0xFF,
         * which never appear in UTF-8. */
        if ((incoming & 0xFE) == 0xFE) {
            err = INVALID;
        } else {
            err = UNEXPECTED_CONTINUATION;
        }
        state->working[0] = incoming;
        (*errh)(err, state->bytes_so_far, state->unichars_so_far,
                state->working, 1);
        state->bytes_so_far++;
        state->working_len = 0;
        state->expecting_continuation = 0;
    } else {
        /* Lead byte of a multi-byte sequence. */
        state->working[0] = incoming;
        state->working_len = 1;
        if ((incoming & 0xE0) == 0xC0) {
            state->expecting_continuation = 1;  /* 2-byte sequence */
        } else if ((incoming & 0xF0) == 0xE0) {
            state->expecting_continuation = 2;  /* 3-byte sequence */
        } else if ((incoming & 0xF8) == 0xF0) {
            state->expecting_continuation = 3;  /* 4-byte sequence */
        } else if ((incoming & 0xFC) == 0xF8) {
            /* 5-byte form -- will be OUT_OF_RANGE if complete */
            state->expecting_continuation = 4;
        } else if ((incoming & 0xFE) == 0xFC) {
            /* 6-byte form -- will be OUT_OF_RANGE if complete */
            state->expecting_continuation = 5;
        }
    }
}

/*
 * Character handler that ignores its arguments; used by end_processing
 * so that the flushing byte is not reported to the caller.
 */
void _donothing_unihandler(uint32_t unichar, int bsf, int usf,
                           uint8_t *buf, int buflen)
{
    (void)unichar;
    (void)bsf;
    (void)usf;
    (void)buf;
    (void)buflen;
}

/*
 * Signal end of input.  A NUL byte is pushed through the decoder so that
 * a sequence still awaiting continuation bytes is reported to errh as
 * MISSING_CONTINUATION.
 *
 * Note that the NUL itself is then processed as an ASCII character with
 * a no-op handler, so bytes_so_far and unichars_so_far each end up one
 * higher than the real input totals.
 */
void end_processing(utf8_decoder_state state, error_handler errh)
{
    process_byte(0x0, state, _donothing_unihandler, errh);
}