note description: "[ Converter from/to UTF-8, UTF-16 and UTF-32 encodings. Handling of invalid encodings ============================= Whenever a UTF-8 or UTF-16 sequence is decoded, the decoding routines also check that the sequence is valid. If it is not, it will replace the invalid unit (e.g. a byte for UTF-8 and a 2-byte unit for UTF-16) by the replacement character U+FFFD as described by variant #3 of the recommended practice for replacement character in Unicode (see http://www.unicode.org/review/pr-121.html for more details). However it means that you cannot roundtrip an incorrectly encoded sequence back and forth between the encoded version and the decoded STRING_32 version. To allow roundtrip, an escaped representation of a badly encoded sequence has been introduced. It adds a fourth variant (which is a slight modification of variant #3) to the recommended practice where the replacement character is followed by the printed hexadecimal value of the invalid byte or the invalid 2-byte sequence. To provide an example (assuming that the Unicode character U+FFFD is represented as ? textually): 1 - on UNIX, any invalid UTF-8 byte sequence such as 0x8F 0x8F is encoded as the following Unicode sequence: U+FFFD U+0038 U+0046 U+FFFD U+0038 U+0046, and textually it looks like "?8F?8F". 2 - on Windows, any invalid UTF-16 2-byte sequence such as 0xD800 0x0054 is encoded as the following Unicode sequence: U+FFFD U+0075 U+0044 U+0038 U+0030 U+0030 U+FFFD U+0035 U+0034, and textually it looks like "?uD800?54". The rule is that if the 2-byte sequence does not fit into 1 byte, it uses the letter u followed by the hexadecimal value of the 2-byte sequence, otherwise it simply uses the 1-byte hexadecimal representation. 
]"
	date: "$Date: 2018-04-28 20:47:11 +0000 (Sat, 28 Apr 2018) $"
	revision: "$Revision: 101695 $"

expanded class
	UTF_CONVERTER

create
	default_create

feature -- Access

	Escape_character: CHARACTER_32 = '�'
			-- Unicode replacement character to escape invalid UTF-8 or UTF-16 encoding.
			-- UTF-8 encoding: 0xEF 0xBF 0xBD
			-- Binary UTF-8 encoding: 11101111 10111111 10111101
			-- UTF-16 encoding: 0xFFFD

feature -- Status report

	is_valid_utf_8_string_8 (s: READABLE_STRING_8): BOOLEAN
			-- Is `s` a valid UTF-8 Unicode sequence?
			-- Checks the lead byte pattern, that enough bytes follow it, and that
			-- each following byte has the form 10xxxxxx.
		local
			c: NATURAL_32
			i, nb: INTEGER_32
		do
			from
				nb := s.count
				Result := True
			until
				i >= nb or not Result
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0x7F then
						-- 0xxxxxxx: single byte, nothing more to check.
				elseif (c & 0xE0) = 0xC0 and i < nb then
						-- 110xxxxx 10xxxxxx
					i := i + 1
					Result := (s.code (i) & 0xC0) = 0x80
				elseif (c & 0xF0) = 0xE0 and i + 1 < nb then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					i := i + 2
					Result := (s.code (i - 1) & 0xC0) = 0x80 and (s.code (i) & 0xC0) = 0x80
				elseif (c & 0xF8) = 0xF0 and i + 2 < nb then
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					i := i + 3
					Result := (s.code (i - 2) & 0xC0) = 0x80 and (s.code (i - 1) & 0xC0) = 0x80 and (s.code (i) & 0xC0) = 0x80
				else
						-- Unknown lead byte, or sequence truncated by the end of `s`.
					Result := False
				end
			end
		ensure
			instance_free: class
		end

	is_valid_utf_16le_string_8 (s: READABLE_STRING_8): BOOLEAN
			-- Is `s` a valid UTF-16LE Unicode sequence?
			-- False when `s.count` is odd.
		local
			c1, c2: NATURAL_32
			i, nb: INTEGER_32
		do
			nb := s.count
			if (nb \\ 2) = 0 then
				from
					Result := True
				until
					i >= nb or not Result
				loop
					i := i + 2
						-- Little endian: low byte first.
					c1 := s.code (i - 1) | (s.code (i) |<< 8)
					if c1 < 0xD800 or c1 >= 0xE000 then
							-- Not a surrogate: a single valid code unit.
					elseif c1 <= 0xDBFF then
							-- High surrogate: a low surrogate must follow.
						i := i + 2
						if i <= nb then
							c2 := s.code (i - 1) | (s.code (i) |<< 8)
								-- Low surrogates are 0xDC00 .. 0xDFFF.
								-- (The upper bound used to be 3583 = 0x0DFF, which made
								-- this test always false.)
							Result := 0xDC00 <= c2 and c2 <= 0xDFFF
						else
							Result := False
						end
					else
							-- Low surrogate without a preceding high surrogate.
						Result := False
					end
				end
			end
		ensure
			instance_free: class
		end

	is_valid_utf_16_subpointer (p: MANAGED_POINTER; start_pos, end_pos: INTEGER_32; a_stop_at_null: BOOLEAN): BOOLEAN
			-- Is `p` a valid UTF-16 Unicode sequence between code unit `start_pos` and `end_pos`?
			-- If `a_stop_at_null` we stop checking after finding a null character.
			-- False when the positions do not designate code units inside `p`.
		local
			i, n: INTEGER_32
			c1, c2: NATURAL_32
		do
			if p.count >= 2 and start_pos >= 0 and start_pos <= end_pos + 1 and end_pos < (p.count // 2) then
				from
						-- `i` and `n` are byte offsets; positions are code unit indexes.
					i := start_pos * 2
					n := end_pos * 2
					Result := True
				until
					i > n or not Result
				loop
					c1 := p.read_natural_16 (i).to_natural_32
					if c1 = 0 and a_stop_at_null then
						i := n + 1
					elseif c1 < 0xD800 or c1 >= 0xE000 then
							-- Not a surrogate: step over one 2-byte code unit.
							-- (Was `i + 1`, which made the next read start at an odd offset.)
						i := i + 2
					elseif c1 <= 0xDBFF then
							-- High surrogate: a low surrogate must follow within range.
						i := i + 2
						if i <= n then
							c2 := p.read_natural_16 (i).to_natural_32
								-- Low surrogates are 0xDC00 .. 0xDFFF (bound was 3583 = 0x0DFF).
							Result := 0xDC00 <= c2 and c2 <= 0xDFFF
								-- Step over the low surrogate so that it is not
								-- re-examined as the start of a new character.
							i := i + 2
						else
							Result := False
						end
					else
							-- Low surrogate without a preceding high surrogate.
						Result := False
					end
				end
			end
		ensure
			instance_free: class
		end

	is_valid_utf_16 (s: SPECIAL [NATURAL_16]): BOOLEAN
			-- Is `s` a valid UTF-16 Unicode sequence?
		local
			i, n: INTEGER_32
			c: NATURAL_16
		do
			from
				i := 0
				n := s.count
				Result := True
			until
				i >= n or not Result
			loop
				c := s.item (i)
				if c < 0xD800 or c >= 0xE000 then
						-- Not a surrogate: a single valid code unit.
				elseif c <= 0xDBFF then
						-- High surrogate: a low surrogate must follow.
					i := i + 1
					if i < n then
						c := s.item (i)
							-- Low surrogates are 0xDC00 .. 0xDFFF (bound was 3583 = 0x0DFF).
						Result := 0xDC00 <= c.to_integer_32 and c <= 0xDFFF
					else
						Result := False
					end
				else
						-- Low surrogate without a preceding high surrogate.
					Result := False
				end
				i := i + 1
			end
		ensure
			instance_free: class
		end

feature -- Measurement

	utf_8_bytes_count (s: READABLE_STRING_GENERAL; start_pos, end_pos: INTEGER_32): INTEGER_32
			-- Number of bytes necessary to encode in UTF-8 `s.substring (start_pos, end_pos)`.
			-- Note that this feature can be used for both escaped and non-escaped string.
			-- In the case of escaped strings, the result will be possibly higher than really needed.
			-- It does not include the terminating null character.
require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
		local
			i: INTEGER_32
			c: NATURAL_32
		do
			from
				i := start_pos
			until
				i > end_pos
			loop
				c := s.code (i)
				if c <= 0x7F then
					Result := Result + 1
				elseif c <= 0x7FF then
					Result := Result + 2
				elseif c <= 0xFFFF then
					Result := Result + 3
				else
					Result := Result + 4
				end
				i := i + 1
			end
		ensure
			instance_free: class
		end

	utf_16_characters_count_form_pointer (m: MANAGED_POINTER; start_pos, end_pos: INTEGER_32): INTEGER_32
			-- Number of characters of the UTF-16 encoded `m` starting at `start_pos` in `m` up to `end_pos - 1`.
			-- It does not include the terminating null character.
			-- `start_pos` and `end_pos` are byte offsets in `m`.
		require
			start_position_big_enough: start_pos >= 0
			end_position: start_pos <= end_pos + 2
			end_pos_small_enought: end_pos < m.count
			even_start_position: start_pos \\ 2 = 0
			even_end_position: end_pos \\ 2 = 0
		local
			i, n: INTEGER_32
			c: NATURAL_32
		do
			from
				i := start_pos
				n := end_pos
			until
				i >= end_pos
			loop
				c := m.read_natural_16 (i).to_natural_32
				if c < 0xD800 or c >= 0xE000 then
						-- Not a surrogate: one code unit, one character.
					i := i + 2
				elseif i <= n then
						-- Surrogate: counted as one character spanning two code units.
					i := i + 4
				end
				Result := Result + 1
			end
		ensure
			instance_free: class
		end

	utf_16_bytes_count (s: READABLE_STRING_GENERAL; start_pos, end_pos: INTEGER_32): INTEGER_32
			-- Number of bytes necessary at the very least to encode in UTF-16 `s.substring (start_pos, end_pos)`.
			-- Note that this feature can be used for both escaped and non-escaped string.
			-- In the case of escaped strings, the result will be possibly higher than really needed.
			-- It does not include the terminating null character.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
		local
			i: INTEGER_32
			c: NATURAL_32
		do
			from
				i := start_pos
			until
				i > end_pos
			loop
				c := s.code (i)
				if c <= 0xFFFF then
						-- One 16-bit code unit.
					Result := Result + 2
				else
						-- Surrogate pair.
					Result := Result + 4
				end
				i := i + 1
			end
		ensure
			instance_free: class
		end

	utf_8_to_string_32_count (s: SPECIAL [CHARACTER_8]; start_pos, end_pos: INTEGER_32): INTEGER_32
			-- Count of characters corresponding to UTF-8 sequence `s`
			-- between indexes `start_pos` and `end_pos`.
			-- Only lead bytes are inspected; a sequence truncated by `end_pos`
			-- is not counted.
		require
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < s.count
		local
			i: INTEGER_32
			n: INTEGER_32
			c: INTEGER_32
		do
			from
				i := start_pos
				n := end_pos
			until
				i > n
			loop
				c := s [i].code
					-- In each multi-byte branch `i` is first moved to the index
					-- just after the sequence, so the sequence is complete when
					-- `i <= n + 1`. (The test used to be `i <= n`, which dropped
					-- a sequence ending exactly at `end_pos`.)
				if c <= 0x7F then
						-- 0xxxxxxx
					i := i + 1
					Result := Result + 1
				elseif c <= 0xDF then
						-- 110xxxxx 10xxxxxx
					i := i + 2
					if i <= n + 1 then
						Result := Result + 1
					end
				elseif c <= 0xEF then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					i := i + 3
					if i <= n + 1 then
						Result := Result + 1
					end
				elseif c <= 0xF7 then
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					i := i + 4
					if i <= n + 1 then
						Result := Result + 1
					end
				else
						-- 0xF8 .. 0xFF is not a UTF-8 lead byte: skip it without
						-- counting. Without this branch `i` never advanced and
						-- the loop did not terminate.
					i := i + 1
				end
			end
		ensure
			instance_free: class
		end

feature -- UTF-32 to UTF-8

	string_32_to_utf_8_string_8 (s: READABLE_STRING_32): STRING_8
			-- UTF-8 sequence corresponding to `s`.
		do
			Result := utf_32_string_to_utf_8_string_8 (s)
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_string_32 (Result).same_string (s)
		end

	string_32_into_utf_8_string_8 (s: READABLE_STRING_32; a_result: STRING_8)
			-- Copy the UTF-8 sequence corresponding to `s` appended into `a_result`.
		do
			utf_32_string_into_utf_8_string_8 (s, a_result)
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

	utf_32_string_to_utf_8_string_8 (s: READABLE_STRING_GENERAL): STRING_8
			-- UTF-8 sequence corresponding to `s` interpreted as a UTF-32 sequence.
do
			create Result.make (s.count)
			utf_32_string_into_utf_8_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_string_32 (Result).same_string_general (s)
		end

	utf_32_string_into_utf_8_string_8 (s: READABLE_STRING_GENERAL; a_result: STRING_8)
			-- Copy the UTF-8 sequence corresponding to `s` interpreted as a UTF-32 sequence
			-- appended into `a_result`.
		local
			i: like {STRING_32}.count
			n: like {STRING_32}.count
			c: NATURAL_32
		do
			from
				n := s.count
					-- Room for at least one byte per character; `extend` handles the rest.
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0x7F then
						-- 0xxxxxxx
					a_result.extend (c.to_character_8)
				elseif c <= 0x7FF then
						-- 110xxxxx 10xxxxxx
					a_result.extend (((c |>> 6) | 0xC0).to_character_8)
					a_result.extend (((c & 0x3F) | 0x80).to_character_8)
				elseif c <= 0xFFFF then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					a_result.extend (((c |>> 12) | 0xE0).to_character_8)
					a_result.extend ((((c |>> 6) & 0x3F) | 0x80).to_character_8)
					a_result.extend (((c & 0x3F) | 0x80).to_character_8)
				else
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					a_result.extend (((c |>> 18) | 0xF0).to_character_8)
					a_result.extend ((((c |>> 12) & 0x3F) | 0x80).to_character_8)
					a_result.extend ((((c |>> 6) & 0x3F) | 0x80).to_character_8)
					a_result.extend (((c & 0x3F) | 0x80).to_character_8)
				end
			end
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string_general (s)
		end

	escaped_utf_32_substring_into_utf_8_0_pointer (s: READABLE_STRING_GENERAL; start_pos, end_pos: INTEGER_32; p: MANAGED_POINTER; p_offset: INTEGER_32; a_new_upper: detachable CELL [INTEGER_32])
			-- Write UTF-8 sequence corresponding to `s`, interpreted as a UTF-32 sequence that could
			-- be escaped, with terminating zero to address `p + p_offset` and update the size of `p` to the
			-- number of written bytes.
			-- If `a_new_upper` is provided, the upper index of `p` containing the zero-termination
			-- is written to `a_new_upper`.
			-- The sequence is zero-terminated.
			-- If `s` contains the `Escape_character` followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s` has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
			p_offset_non_negative: p_offset >= 0
		local
			i, n, m, l_count: INTEGER_32
			c: NATURAL_32
			l_encoded_value: READABLE_STRING_GENERAL
			l_decoded, l_resized: BOOLEAN
		do
				-- `n` is the number of characters to convert. It is only used for
				-- the initial size estimate below.
			n := end_pos - start_pos + 1
			l_count := p.count
			if l_count - p_offset < (n + 1) then
				l_count := p_offset + utf_8_bytes_count (s, start_pos, end_pos) + 1
				p.resize (l_count)
				l_resized := True
			end
			from
				m := p_offset
				i := start_pos - 1
			until
				i >= end_pos
			loop
				i := i + 1
				c := s.code (i)
				if c = Escape_character.natural_32_code then
						-- `i` is an index in `s`, so the look-ahead is bounded by
						-- `end_pos`. (It used to be compared with the substring
						-- length `n`, so escapes were not decoded when
						-- `start_pos > 1`.)
					if i < end_pos and then s.item (i + 1) = Escape_character then
							-- Doubled escape character: skip one, write the other as is.
						i := i + 1
					elseif i + 1 < end_pos then
						l_encoded_value := s.substring (i + 1, i + 2)
						if is_hexa_decimal (l_encoded_value) then
							c := to_natural_32 (l_encoded_value)
							if c <= 0x7F then
									-- Not treated as an escaped byte: keep the escape
									-- character itself.
								c := Escape_character.natural_32_code
							else
									-- `c` is the raw byte to write back.
								l_decoded := True
								i := i + 2
							end
						end
					end
				end
				if not l_decoded then
					if c <= 0x7F then
						p.put_natural_8 (c.to_natural_8, m)
						m := m + 1
					else
							-- Multi-byte character: make sure there is room left.
						if not l_resized and then (m + 5 + (end_pos - i) > l_count) then
							l_count := m + utf_8_bytes_count (s, i, end_pos) + 1
							p.resize (l_count)
							l_resized := True
						end
						if c <= 0x7FF then
							p.put_natural_8 (((c |>> 6) | 0xC0).to_natural_8, m)
							p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 1)
							m := m + 2
						elseif c <= 0xFFFF then
							p.put_natural_8 (((c |>> 12) | 0xE0).to_natural_8, m)
							p.put_natural_8 ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 1)
							p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 2)
							m := m + 3
						else
							p.put_natural_8 (((c |>> 18) | 0xF0).to_natural_8, m)
							p.put_natural_8 ((((c |>> 12) & 0x3F) | 0x80).to_natural_8, m + 1)
							p.put_natural_8 ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 2)
							p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 3)
							m := m + 4
						end
					end
				else
						-- Escaped byte: write it unchanged.
					l_decoded := False
					p.put_natural_8 (c.to_natural_8, m)
					m := m + 1
				end
			end
			if l_resized then
					-- Shrink to what was written plus the terminating zero.
				p.resize (m + 1)
			end
			p.put_natural_8 (0, m)
			if a_new_upper /= Void then
				a_new_upper.put (m)
			end
		ensure
			instance_free: class
			roundtrip: a_new_upper /= Void implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, a_new_upper.item - 1, False).same_string_general (s.substring (start_pos, end_pos))
			roundtrip: (a_new_upper = Void and then not s.substring (start_pos, end_pos).has ('%U'.to_character_32)) implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, p.count, True).same_string_general (s.substring (start_pos, end_pos))
		end

	escaped_utf_32_string_to_utf_8_string_8 (s: READABLE_STRING_GENERAL): STRING_8
			-- UTF-8 sequence corresponding to `s` interpreted as a UTF-32 sequence that could be escaped.
			-- If `s` contains the `Escape_character` followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s` has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
		do
			create Result.make (s.count)
			escaped_utf_32_string_into_utf_8_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_escaped_string_32 (Result).same_string_general (s)
		end

	escaped_utf_32_string_into_utf_8_string_8 (s: READABLE_STRING_GENERAL; a_result: STRING_8)
			-- Copy the UTF-8 sequence corresponding to `s` interpreted as a UTF-32 sequence that could
			-- be escaped appended into `a_result`.
			-- If `s` contains the `Escape_character` followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s` has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
-- See the note clause for the class for more details on the encoding.
		local
			i: like {STRING_32}.count
			n: like {STRING_32}.count
			c: NATURAL_32
			l_encoded_value: READABLE_STRING_GENERAL
			l_decoded: BOOLEAN
		do
			from
				n := s.count
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c = Escape_character.natural_32_code then
					if i < n and then s.item (i + 1) = Escape_character then
							-- Doubled escape character: skip one, write the other as is.
						i := i + 1
					elseif i + 1 < n then
						l_encoded_value := s.substring (i + 1, i + 2)
						if is_hexa_decimal (l_encoded_value) then
							c := to_natural_32 (l_encoded_value)
							if c <= 0x7F then
									-- Not treated as an escaped byte: keep the escape character.
								c := Escape_character.natural_32_code
							else
									-- `c` is the raw byte to write back.
								l_decoded := True
								i := i + 2
							end
						end
					end
				end
				if not l_decoded then
					if c <= 0x7F then
						a_result.extend (c.to_character_8)
					elseif c <= 0x7FF then
						a_result.extend (((c |>> 6) | 0xC0).to_character_8)
						a_result.extend (((c & 0x3F) | 0x80).to_character_8)
					elseif c <= 0xFFFF then
						a_result.extend (((c |>> 12) | 0xE0).to_character_8)
						a_result.extend ((((c |>> 6) & 0x3F) | 0x80).to_character_8)
						a_result.extend (((c & 0x3F) | 0x80).to_character_8)
					else
						a_result.extend (((c |>> 18) | 0xF0).to_character_8)
						a_result.extend ((((c |>> 12) & 0x3F) | 0x80).to_character_8)
						a_result.extend ((((c |>> 6) & 0x3F) | 0x80).to_character_8)
						a_result.extend (((c & 0x3F) | 0x80).to_character_8)
					end
				else
						-- Escaped byte: append it unchanged.
					l_decoded := False
					a_result.extend (c.to_character_8)
				end
			end
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_escaped_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string_general (s)
		end

	string_32_into_utf_8_0_pointer (s: READABLE_STRING_32; p: MANAGED_POINTER; p_offset: INTEGER_32; a_new_upper: detachable CELL [INTEGER_32])
			-- Write UTF-8 sequence corresponding to `s` with terminating zero
			-- to address `p + p_offset` and update the size of `p` to the number of written bytes.
			-- If `a_new_upper` is provided, the upper index of `p` containing the zero-termination
			-- is written to `a_new_upper`.
			-- The sequence is zero-terminated.
		require
			p_offset_non_negative: p_offset >= 0
		do
			utf_32_string_into_utf_8_0_pointer (s, p, p_offset, a_new_upper)
		ensure
			instance_free: class
			roundtrip: a_new_upper /= Void implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, a_new_upper.item - 1, False).same_string (s)
			roundtrip: (a_new_upper = Void and then not s.has ('%U'.to_character_32)) implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, p.count, True).same_string_general (s)
		end

	utf_32_string_into_utf_8_0_pointer (s: READABLE_STRING_GENERAL; p: MANAGED_POINTER; p_offset: INTEGER_32; a_new_upper: detachable CELL [INTEGER_32])
			-- Write UTF-8 sequence corresponding to `s`, interpreted as a UTF-32 sequence,
			-- with terminating zero to address `p + p_offset` and update the size of `p` to the
			-- number of written bytes.
			-- If `a_new_upper` is provided, the upper index of `p` containing the zero-termination
			-- is written to `a_new_upper`.
			-- The sequence is zero-terminated.
		require
			p_offset_non_negative: p_offset >= 0
		local
			m: INTEGER_32
			i, n, l_count: INTEGER_32
			c: NATURAL_32
			l_resized: BOOLEAN
		do
			n := s.count
			l_count := p.count
				-- Not even room for one byte per character plus the zero: size exactly.
			if l_count - p_offset < (n + 1) then
				l_count := p_offset + utf_8_bytes_count (s, 1, n) + 1
				p.resize (l_count)
				l_resized := True
			end
			from
				i := 0
				m := p_offset
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0x7F then
					p.put_natural_8 (c.to_natural_8, m)
					m := m + 1
				else
						-- Multi-byte character: make sure there is room left.
					if not l_resized and then (m + 5 + (n - i) > l_count) then
						l_count := m + utf_8_bytes_count (s, i, n) + 1
						p.resize (l_count)
						l_resized := True
					end
					if c <= 0x7FF then
						p.put_natural_8 (((c |>> 6) | 0xC0).to_natural_8, m)
						p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 1)
						m := m + 2
					elseif c <= 0xFFFF then
						p.put_natural_8 (((c |>> 12) | 0xE0).to_natural_8, m)
						p.put_natural_8 ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 1)
						p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 2)
						m := m + 3
					else
						p.put_natural_8 (((c |>> 18) | 0xF0).to_natural_8, m)
						p.put_natural_8 ((((c |>> 12) & 0x3F) | 0x80).to_natural_8, m + 1)
						p.put_natural_8 ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 2)
						p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 3)
						m := m + 4
					end
				end
			end
			if l_resized then
					-- Shrink to what was written plus the terminating zero.
				p.resize (m + 1)
			end
			p.put_natural_8 (0, m)
			if a_new_upper /= Void then
				a_new_upper.put (m)
			end
		ensure
			instance_free: class
			roundtrip: a_new_upper /= Void implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, a_new_upper.item - 1, False).same_string_general (s)
			roundtrip: (a_new_upper = Void and then not s.has ('%U'.to_character_32)) implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, p.count, True).same_string_general (s)
		end

	utf_32_string_to_utf_8 (s: READABLE_STRING_GENERAL): SPECIAL [NATURAL_8]
			-- UTF-8 sequence corresponding to `s`, interpreted as a UTF-32 sequence.
			-- The sequence is not zero-terminated.
		do
				-- Build the zero-terminated form, then drop its last item.
			Result := utf_32_string_to_utf_8_0 (s)
			Result := Result.aliased_resized_area_with_default (0, Result.count - 1)
		ensure
			instance_free: class
			roundtrip: attached utf_32_string_to_utf_8_string_8 (s) as l_ref and then across Result as l_spec all l_spec.item.to_natural_32 = l_ref.code (l_spec.target_index + 1) end
		end

	utf_32_string_to_utf_8_0 (s: READABLE_STRING_GENERAL): SPECIAL [NATURAL_8]
			-- UTF-8 sequence corresponding to `s`, interpreted as a UTF-32 sequence.
			-- The sequence is zero-terminated.
		local
			m: INTEGER_32
			i, n: like {STRING_32}.count
			c: NATURAL_32
		do
			n := s.count
				-- `m` first holds the byte count, then becomes the write index.
			m := utf_8_bytes_count (s, 1, n)
			from
				create Result.make_filled (0, m + 1)
				i := 0
				m := 0
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0x7F then
					Result.put (c.to_natural_8, m)
					m := m + 1
				elseif c <= 0x7FF then
					Result.put (((c |>> 6) | 0xC0).to_natural_8, m)
					Result.put (((c & 0x3F) | 0x80).to_natural_8, m + 1)
					m := m + 2
				elseif c <= 0xFFFF then
					Result.put (((c |>> 12) | 0xE0).to_natural_8, m)
					Result.put ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 1)
					Result.put (((c & 0x3F) | 0x80).to_natural_8, m + 2)
					m := m + 3
				else
					Result.put (((c |>> 18) | 0xF0).to_natural_8, m)
					Result.put ((((c |>> 12) & 0x3F) | 0x80).to_natural_8, m + 1)
					Result.put ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 2)
					Result.put (((c & 0x3F) | 0x80).to_natural_8, m + 3)
					m := m + 4
				end
			end
			Result.put (0, m)
		ensure
			instance_free: class
			attached_utf_8_string: attached utf_32_string_to_utf_8_string_8 (s) as l_ref
			count: Result.count = l_ref.count + 1
			roundtrip: across l_ref as ic all ic.item = Result [ic.target_index - 1].to_character_8 end
			zero_terminated: Result [Result.upper] = 0
		end

feature -- UTF-8 to UTF-32

	utf_8_0_pointer_to_escaped_string_32 (p: MANAGED_POINTER): STRING_32
			-- STRING_32 object corresponding to UTF-8 sequence `p` which is zero-terminated,
			-- where invalid UTF-8 sequences are escaped.
		do
			create Result.make (p.count)
			utf_8_0_pointer_into_escaped_string_32 (p, Result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_8_string_8 (Result) as l_str and then across l_str as l_char all l_char.item = p.read_natural_8 (l_char.target_index - 1).to_character_8 end
		end

	utf_8_0_pointer_into_escaped_string_32 (p: MANAGED_POINTER; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-8 sequence `p` which is zero-terminated,
			-- where invalid UTF-8 sequences are escaped, appended into `a_result`.
do
			utf_8_0_subpointer_into_escaped_string_32 (p, 0, p.count - 1, True, a_result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_8_string_8 (a_result.substring (old a_result.count + 1, a_result.count)) as l_str and then across l_str as l_char all l_char.item = p.read_natural_8 (l_char.target_index - 1).to_character_8 end
		end

	utf_8_0_subpointer_to_escaped_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER_32; a_stop_at_null: BOOLEAN): STRING_32
			-- STRING_32 object corresponding to UTF-8 sequence `p` between indexes `start_pos` and
			-- `end_pos` or the first null character encountered if `a_stop_at_null`, where invalid
			-- UTF-8 sequences are escaped.
		require
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count
		do
			create Result.make (p.count)
			utf_8_0_subpointer_into_escaped_string_32 (p, start_pos, end_pos, a_stop_at_null, Result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_8_string_8 (Result) as l_str and then across l_str as l_char all l_char.item = p.read_natural_8 (start_pos + l_char.target_index - 1).to_character_8 end
		end

	utf_8_0_subpointer_into_escaped_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER_32; a_stop_at_null: BOOLEAN; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-8 sequence `p` between indexes `start_pos` and
			-- `end_pos` or the first null character encountered if `a_stop_at_null`, where invalid
			-- UTF-8 sequences are escaped, appended into `a_result`.
		require
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count
		local
			i: like {STRING_8}.count
			c1, c2, c3, c4: NATURAL_8
			l_last_char: CHARACTER_32
		do
			from
				a_result.grow (a_result.count + end_pos - start_pos + 1)
				i := start_pos
			until
				i > end_pos
			loop
				c1 := p.read_natural_8 (i)
				if c1 = 0 and a_stop_at_null then
					i := end_pos + 1
				elseif c1 <= 0x7F then
						-- 0xxxxxxx
					a_result.extend (c1.to_character_32)
					i := i + 1
				elseif (c1 & 0xE0) = 0xC0 then
						-- 110xxxxx 10xxxxxx
					if i < end_pos then
						c2 := p.read_natural_8 (i + 1)
						if (c2 & 0xC0) = 0x80 then
							a_result.extend ((((c1.as_natural_32 & 0x1F) |<< 6) | (c2.as_natural_32 & 0x3F)).to_character_32)
							i := i + 2
						else
								-- Bad continuation byte: escape the lead byte only.
							escape_code_into (a_result, c1.to_natural_16)
							i := i + 1
						end
					else
							-- Sequence truncated by `end_pos`.
						escape_code_into (a_result, c1.to_natural_16)
						i := i + 1
					end
				elseif (c1 & 0xF0) = 0xE0 then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					if i + 1 < end_pos then
						c2 := p.read_natural_8 (i + 1)
						c3 := p.read_natural_8 (i + 2)
						if (c2 & 0xC0) = 0x80 and (c3 & 0xC0) = 0x80 then
							l_last_char := (((c1.as_natural_32 & 0x0F) |<< 12) | ((c2.as_natural_32 & 0x3F) |<< 6) | (c3.as_natural_32 & 0x3F)).to_character_32
							a_result.extend (l_last_char)
							i := i + 3
						else
							escape_code_into (a_result, c1.to_natural_16)
							i := i + 1
						end
					else
						escape_code_into (a_result, c1.to_natural_16)
						i := i + 1
					end
				elseif (c1 & 0xF8) = 0xF0 then
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					if i + 2 < end_pos then
						c2 := p.read_natural_8 (i + 1)
						c3 := p.read_natural_8 (i + 2)
						c4 := p.read_natural_8 (i + 3)
						if (c2 & 0xC0) = 0x80 and (c3 & 0xC0) = 0x80 and (c4 & 0xC0) = 0x80 then
							a_result.extend ((((c1.as_natural_32 & 0x07) |<< 18) | ((c2.as_natural_32 & 0x3F) |<< 12) | ((c3.as_natural_32 & 0x3F) |<< 6) | (c4.as_natural_32 & 0x3F)).to_character_32)
							i := i + 4
						else
							escape_code_into (a_result, c1.to_natural_16)
							i := i + 1
						end
					else
						escape_code_into (a_result, c1.to_natural_16)
						i := i + 1
					end
				else
						-- Byte that cannot start a sequence.
					escape_code_into (a_result, c1.to_natural_16)
					i := i + 1
				end
			end
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_8_string_8 (a_result.substring (old a_result.count + 1, a_result.count)) as l_str and then across l_str as l_char all l_char.item = p.read_natural_8 (start_pos + l_char.target_index - 1).to_character_8 end
		end

	utf_8_string_8_to_string_32 (s: READABLE_STRING_8): STRING_32
			-- STRING_32 corresponding to UTF-8 sequence `s`.
		do
			create Result.make (s.count)
			utf_8_string_8_into_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_8_string_8 (s) implies utf_32_string_to_utf_8_string_8 (Result).same_string (s)
		end

	utf_8_string_8_into_string_32 (s: READABLE_STRING_8; a_result: STRING_32)
			-- Copy STRING_32 corresponding to UTF-8 sequence `s` appended into `a_result`.
			-- Continuation bytes are not validated here; a sequence truncated by the
			-- end of `s`, or a byte above 0xF7, produces no character.
		local
			i: like {STRING_8}.count
			n: like {STRING_8}.count
			c: NATURAL_32
		do
			from
				n := s.count
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0x7F then
						-- 0xxxxxxx
					a_result.extend (c.to_character_32)
				elseif c <= 0xDF then
						-- 110xxxxx 10xxxxxx
					i := i + 1
					if i <= n then
						a_result.extend ((((c & 0x1F) |<< 6) | (s.code (i) & 0x3F)).to_character_32)
					end
				elseif c <= 0xEF then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					i := i + 2
					if i <= n then
						a_result.extend ((((c & 0x0F) |<< 12) | ((s.code (i - 1) & 0x3F) |<< 6) | (s.code (i) & 0x3F)).to_character_32)
					end
				elseif c <= 0xF7 then
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					i := i + 3
					if i <= n then
						a_result.extend ((((c & 0x07) |<< 18) | ((s.code (i - 2) & 0x3F) |<< 12) | ((s.code (i - 1) & 0x3F) |<< 6) | (s.code (i) & 0x3F)).to_character_32)
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: is_valid_utf_8_string_8 (s) implies utf_32_string_to_utf_8_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

	utf_8_string_8_to_escaped_string_32 (s: READABLE_STRING_8): STRING_32
			-- STRING_32 corresponding to UTF-8 sequence `s`, where invalid UTF-8 sequences are escaped.
		do
			create Result.make (s.count)
			utf_8_string_8_into_escaped_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: escaped_utf_32_string_to_utf_8_string_8 (Result).same_string (s)
		end

	utf_8_string_8_into_escaped_string_32 (s: READABLE_STRING_8; a_result: STRING_32)
			-- Copy STRING_32 corresponding to UTF-8 sequence `s`, where invalid UTF-8 sequences are escaped,
			-- appended into `a_result`.
		local
			i: like {STRING_8}.count
			n: like {STRING_8}.count
			c1, c2, c3, c4: NATURAL_8
			l_last_char: CHARACTER_32
		do
			from
				n := s.count
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
				i := i + 1
				c1 := s.code (i).as_natural_8
				if c1 <= 0x7F then
						-- 0xxxxxxx
					a_result.extend (c1.to_character_32)
				elseif (c1 & 0xE0) = 0xC0 then
						-- 110xxxxx 10xxxxxx
					if i < n then
						c2 := s.code (i + 1).as_natural_8
						if (c2 & 0xC0) = 0x80 then
							a_result.extend ((((c1.as_natural_32 & 0x1F) |<< 6) | (c2.as_natural_32 & 0x3F)).to_character_32)
							i := i + 1
						else
								-- Bad continuation byte: escape the lead byte only.
							escape_code_into (a_result, c1.to_natural_16)
						end
					else
							-- Sequence truncated by the end of `s`.
						escape_code_into (a_result, c1.to_natural_16)
					end
				elseif (c1 & 0xF0) = 0xE0 then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					if i + 1 < n then
						c2 := s.code (i + 1).as_natural_8
						c3 := s.code (i + 2).as_natural_8
						if (c2 & 0xC0) = 0x80 and (c3 & 0xC0) = 0x80 then
							l_last_char := (((c1.as_natural_32 & 0x0F) |<< 12) | ((c2.as_natural_32 & 0x3F) |<< 6) | (c3.as_natural_32 & 0x3F)).to_character_32
							a_result.extend (l_last_char)
							i := i + 2
						else
							escape_code_into (a_result, c1.to_natural_16)
						end
					else
						escape_code_into (a_result, c1.to_natural_16)
					end
				elseif (c1 & 0xF8) = 0xF0 then
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					if i + 2 < n then
						c2 := s.code (i + 1).as_natural_8
						c3 := s.code (i + 2).as_natural_8
						c4 := s.code (i + 3).as_natural_8
						if (c2 & 0xC0) = 0x80 and (c3 & 0xC0) = 0x80 and (c4 & 0xC0) = 0x80 then
							a_result.extend ((((c1.as_natural_32 & 0x07) |<< 18) | ((c2.as_natural_32 & 0x3F) |<< 12) | ((c3.as_natural_32 & 0x3F) |<< 6) | (c4.as_natural_32 & 0x3F)).to_character_32)
							i := i + 3
						else
							escape_code_into (a_result, c1.to_natural_16)
						end
					else
						escape_code_into (a_result, c1.to_natural_16)
					end
				else
						-- Byte that cannot start a sequence.
					escape_code_into (a_result, c1.to_natural_16)
				end
			end
		ensure
instance_free: class
			roundtrip: escaped_utf_32_string_to_utf_8_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

feature -- UTF-32 to UTF-16

	string_32_to_utf_16 (s: READABLE_STRING_32): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to `s`.
			-- The sequence is not zero-terminated.
		do
			Result := utf_32_string_to_utf_16 (s)
		ensure
			instance_free: class
				-- Each code unit is rebuilt from two consecutive bytes of the
				-- UTF-16LE form: low byte first, high byte shifted by 8.
				-- (The shift used to be 16, unlike the sibling postconditions.)
			roundtrip: attached utf_32_string_to_utf_16le_string_8 (s) as l_ref and then across Result as l_spec all l_spec.item.to_natural_32 = (l_ref.code (l_spec.target_index * 2 + 1) | (l_ref.code ((l_spec.target_index + 1) * 2) |<< 8)) end
		end

	utf_32_string_to_utf_16 (s: READABLE_STRING_GENERAL): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to `s` interpreted as a UTF-32 sequence.
			-- The sequence is not zero-terminated.
		do
				-- Build the zero-terminated form, then drop its last item.
			Result := utf_32_string_to_utf_16_0 (s)
			Result := Result.aliased_resized_area_with_default (0, Result.count - 1)
		ensure
			instance_free: class
			roundtrip: attached utf_32_string_to_utf_16le_string_8 (s) as l_ref and then across Result as l_spec all l_spec.item.to_natural_32 = (l_ref.code (l_spec.target_index * 2 + 1) | (l_ref.code ((l_spec.target_index + 1) * 2) |<< 8)) end
		end

	string_32_to_utf_16_0 (s: READABLE_STRING_32): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to `s` with terminating zero.
		do
			Result := utf_32_string_to_utf_16_0 (s)
		ensure
			instance_free: class
			roundtrip: attached utf_32_string_to_utf_16le_string_8 (s) as l_ref and then across Result.resized_area_with_default (0, Result.count - 1) as l_spec all l_spec.item.to_natural_32 = (l_ref.code (l_spec.target_index * 2 + 1) | ((l_ref.code ((l_spec.target_index + 1) * 2)) |<< 8)) end
		end

	utf_32_string_to_utf_16_0 (s: READABLE_STRING_GENERAL): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to `s`, interpreted as a UTF-32 sequence,
			-- with terminating zero.
local i: like {STRING_32}.count n: like {STRING_32}.count m: like {STRING_32}.count p: like {STRING_32}.count c: NATURAL_32 do from m := 0 n := s.count p := n create Result.make_empty (p + 1) invariant m = Result.count p + 1 = Result.capacity until i >= n loop i := i + 1 if p < m + 2 then p := m + (n - i) + 2 Result := Result.aliased_resized_area (p + 1) end c := s.code (i) if c <= 65535 then Result.extend (c.to_natural_16) m := m + 1 else Result.extend ((55232 + (c |>> 10)).to_natural_16) Result.extend ((56320 + (c & 1023)).to_natural_16) m := m + 2 end end Result.extend (0) ensure instance_free: class roundtrip: attached utf_32_string_to_utf_16le_string_8 (s) as l_ref and then across Result.resized_area_with_default (0, Result.count - 1) as l_spec all l_spec.item.to_natural_32 = (l_ref.code (l_spec.target_index * 2 + 1) | ((l_ref.code ((l_spec.target_index + 1) * 2)) |<< 8)) end end string_32_into_utf_16_pointer (s: READABLE_STRING_32; p: MANAGED_POINTER; p_offset: INTEGER_32; a_new_upper: detachable CELL [INTEGER_32]) -- Write UTF-16 sequence corresponding to s to address p + p_offset -- and update the size of p to the number of written bytes. -- If a_new_upper is provided, the upper index of p containing the zero-termination -- is written to a_new_upper. -- The sequence is not zero-terminated. 
require even_p_offset: (p_offset \\ 2) = 0 p_offset_non_negative: p_offset >= 0 do utf_32_substring_into_utf_16_pointer (s, 1, s.count, p, p_offset, a_new_upper) ensure instance_free: class roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string (s) roundtrip: (a_new_upper = Void and then not s.has ('%U'.to_character_32)) implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string (s) end string_32_into_utf_16_0_pointer (s: READABLE_STRING_32; p: MANAGED_POINTER; p_offset: INTEGER_32; a_new_upper: detachable CELL [INTEGER_32]) -- Write UTF-16 sequence corresponding to s with terminating zero -- to address p + p_offset and update the size of p to the number of written bytes. -- If a_new_upper is provided, the upper index of p containing the zero-termination -- is written to a_new_upper. -- The sequence is zero-terminated. require even_p_offset: (p_offset \\ 2) = 0 p_offset_non_negative: p_offset >= 0 do utf_32_substring_into_utf_16_0_pointer (s, 1, s.count, p, p_offset, a_new_upper) ensure instance_free: class roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string (s) roundtrip: (a_new_upper = Void and then not s.has ('%U'.to_character_32)) implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string (s) end utf_32_substring_into_utf_16_pointer (s: READABLE_STRING_GENERAL; start_pos, end_pos: like {READABLE_STRING_32}.count; p: MANAGED_POINTER; p_offset: INTEGER_32; a_new_upper: detachable CELL [INTEGER_32]) -- Write UTF-16 sequence corresponding to the substring of s, -- interpreted as a UTF-32 sequence, starting at index start_pos -- and ending at index end_pos to address p + p_offset and update the -- size of p to the number of written bytes. 
-- If `a_new_upper' is provided, it receives the byte index in `p' immediately
			-- after the last written code unit (the position at which the zero-terminating
			-- variant places its terminating zero).
			-- The sequence is not zero-terminated when `p' had to be resized; otherwise a
			-- zero code unit is still written after the data in the existing buffer.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
			even_p_offset: (p_offset \\ 2) = 0
			p_offset_non_negative: p_offset >= 0
		local
			m: INTEGER_32
		do
				-- Remember the size on entry so that a resize performed by the callee can be detected.
			m := p.count
			utf_32_substring_into_utf_16_0_pointer (s, start_pos, end_pos, p, p_offset, a_new_upper)
			if m < p.count then
					-- `p' was grown by the callee and now ends with the 2-byte terminating zero:
					-- drop that zero so that `p' holds only the encoded data.
				p.resize (p.count - 2)
				if a_new_upper /= Void then
						-- Byte index just past the last code unit, i.e. the new size of `p'.
						-- This is the value expected by the `roundtrip' postcondition and the
						-- same value the callee reports when no resize takes place.
					a_new_upper.put (p.count)
				end
			end
		ensure
			instance_free: class
			p_count_may_increase: p.count >= old p.count
			roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string_general (s.substring (start_pos, end_pos))
			roundtrip: (a_new_upper = Void and then not s.substring (start_pos, end_pos).has ('%U'.to_character_32)) implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string_general (s.substring (start_pos, end_pos))
		end

	utf_32_substring_into_utf_16_0_pointer (s: READABLE_STRING_GENERAL; start_pos, end_pos: like {READABLE_STRING_32}.count; p: MANAGED_POINTER; p_offset: INTEGER_32; a_new_upper: detachable CELL [INTEGER_32])
			-- Write UTF-16 sequence corresponding to the substring of `s',
			-- interpreted as a UTF-32 sequence, starting at index `start_pos'
			-- and ending at index `end_pos' to address `p + p_offset' and update the
			-- size of `p' to the number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is zero-terminated.
require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
			even_p_offset: (p_offset \\ 2) = 0
			p_offset_non_negative: p_offset >= 0
		local
			i: like {READABLE_STRING_GENERAL}.count
			c: NATURAL_32
			m, l_count: like {MANAGED_POINTER}.count
			l_resized: BOOLEAN
		do
			from
					-- `i' temporarily holds the number of characters to convert.
				i := end_pos - start_pos + 1
				l_count := p.count
				if l_count - p_offset < (i + 1) * 2 then
						-- Not even room for one code unit per character plus the terminating zero:
						-- resize once to the exact number of bytes required.
					l_count := p_offset + utf_16_bytes_count (s, start_pos, end_pos) + 2
					p.resize (l_count)
					l_resized := True
				end
				i := start_pos - 1
					-- `m' is the byte offset in `p' of the next code unit to write.
				m := p_offset
			until
				i >= end_pos
			loop
				i := i + 1
				c := s.code (i)
				if c <= 65535 then
						-- Code point up to 0xFFFF: a single UTF-16 code unit.
					p.put_natural_16 (c.to_natural_16, m)
					m := m + 2
				else
						-- Supplementary code point: needs a surrogate pair (4 bytes).
					if not l_resized and then (m + 6 + (end_pos - i) * 2 > l_count) then
							-- The pair, one code unit per remaining character and the terminating
							-- zero would not fit: resize to the exact number of bytes required.
						l_count := m + utf_16_bytes_count (s, i, end_pos) + 2
						p.resize (l_count)
						l_resized := True
					end
						-- High surrogate: 0xD7C0 + (c >> 10), i.e. 0xD800 + ((c - 0x10000) >> 10).
					p.put_natural_16 ((55232 + (c |>> 10)).to_natural_16, m)
						-- Low surrogate: 0xDC00 + (c & 0x3FF).
					p.put_natural_16 ((56320 + (c & 1023)).to_natural_16, m + 2)
					m := m + 4
				end
			end
			if l_resized then
					-- Adjust the size to the written data plus the terminating zero.
				p.resize (m + 2)
			end
			p.put_natural_16 (0, m)
			if a_new_upper /= Void then
					-- Report the byte index of the terminating zero.
				a_new_upper.put (m)
			end
		ensure
			instance_free: class
			p_count_may_increase: p.count >= old p.count
			roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string_general (s.substring (start_pos, end_pos))
			roundtrip: (a_new_upper = Void and then not s.substring (start_pos, end_pos).has ('%U'.to_character_32)) implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string_general (s.substring (start_pos, end_pos))
		end

	utf_32_string_to_utf_16le_string_8 (s: READABLE_STRING_GENERAL): STRING_8
			-- UTF-16LE sequence corresponding to `s' interpreted as a UTF-32 sequence
		do
				-- Initial capacity: two bytes per character of `s'.
			create Result.make (s.count * 2)
			utf_32_string_into_utf_16le_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: utf_16le_string_8_to_string_32 (Result).same_string_general (s)
		end

	utf_32_string_into_utf_16le_string_8 (s: READABLE_STRING_GENERAL; a_result: STRING_8)
			-- Copy UTF-16LE sequence corresponding to `s'
-- interpreted as a UTF-32 sequence
			-- appended into `a_result'.
		local
			i: like {STRING_32}.count
			n: like {STRING_32}.count
			c: NATURAL_32
			l_nat16: NATURAL_16
		do
			from
				n := s.count
					-- Reserve two bytes per character of `s'.
				a_result.grow (a_result.count + n * 2)
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c <= 65535 then
						-- Single code unit, low byte first (little endian).
					a_result.extend ((c & 255).to_character_8)
					a_result.extend (((c & 65280) |>> 8).to_character_8)
				else
						-- Surrogate pair: high surrogate 0xD7C0 + (c >> 10) ...
					l_nat16 := (55232 + (c |>> 10)).to_natural_16
					a_result.extend ((l_nat16 & 255).to_character_8)
					a_result.extend (((l_nat16 & 65280) |>> 8).to_character_8)
						-- ... followed by low surrogate 0xDC00 + (c & 0x3FF).
					l_nat16 := (56320 + (c & 1023)).to_natural_16
					a_result.extend ((l_nat16 & 255).to_character_8)
					a_result.extend (((l_nat16 & 65280) |>> 8).to_character_8)
				end
			end
		ensure
			instance_free: class
			roundtrip: utf_16le_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string_general (s)
		end

	escaped_utf_32_substring_into_utf_16_0_pointer (s: READABLE_STRING_GENERAL; start_pos, end_pos: like {READABLE_STRING_32}.count; p: MANAGED_POINTER; p_offset: INTEGER_32; a_new_upper: detachable CELL [INTEGER_32])
			-- Write UTF-16 sequence corresponding to the substring of `s',
			-- interpreted as a UTF-32 sequence, starting at index `start_pos'
			-- and ending at index `end_pos' to address `p + p_offset' and update the
			-- size of `p' to the number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is zero-terminated.
require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
			even_p_offset: (p_offset \\ 2) = 0
			p_offset_non_negative: p_offset >= 0
		local
			i, n, m, l_count: INTEGER_32
			c: NATURAL_32
			l_encoded_value: READABLE_STRING_GENERAL
			l_decoded: BOOLEAN
			l_resized: BOOLEAN
		do
			from
					-- `n' is the number of characters to convert; it is only used for sizing.
				n := end_pos - start_pos + 1
				l_count := p.count
				if l_count - p_offset < (n + 1) * 2 then
						-- Not even room for one code unit per character plus the terminating zero.
					l_count := p_offset + utf_16_bytes_count (s, start_pos, end_pos) + 2
					p.resize (l_count)
					l_resized := True
				end
				i := start_pos - 1
					-- `m' is the byte offset in `p' of the next code unit to write.
				m := p_offset
			until
				i >= end_pos
			loop
				i := i + 1
				c := s.code (i)
				if c = Escape_character.natural_32_code then
						-- `i' is an absolute index in `s', so the look-ahead is bounded by
						-- `end_pos' (not by the substring length), which keeps escapes decodable
						-- when `start_pos' > 1 and never reads past the requested substring.
					if i < end_pos then
						if s.item (i + 1) = Escape_character then
								-- Doubled escape character: emit a single one.
							i := i + 1
						elseif s.item (i + 1) = 'u'.to_character_32 then
								-- Possibly "uHHHH": the 4 hexadecimal digits are at i + 2 .. i + 5.
							if i + 4 < end_pos then
								l_encoded_value := s.substring (i + 2, i + 5)
								if is_hexa_decimal (l_encoded_value) then
									c := to_natural_32 (l_encoded_value)
									if c < 55296 or c > 57343 then
											-- Not a surrogate code unit (0xD800 .. 0xDFFF), so this
											-- is not an escaped sequence: keep the escape character as is.
										c := Escape_character.natural_32_code
									else
										l_decoded := True
										i := i + 5
									end
								end
							end
						end
					end
				end
				if not l_decoded then
					if c <= 65535 then
							-- Code point up to 0xFFFF: a single UTF-16 code unit.
						p.put_natural_16 (c.to_natural_16, m)
						m := m + 2
					else
						if not l_resized and then (m + 6 + (end_pos - i) * 2 > l_count) then
								-- The surrogate pair, one code unit per remaining character and the
								-- terminating zero would not fit.
							l_count := m + utf_16_bytes_count (s, i, end_pos) + 2
							p.resize (l_count)
							l_resized := True
						end
							-- High surrogate 0xD7C0 + (c >> 10), low surrogate 0xDC00 + (c & 0x3FF).
						p.put_natural_16 ((55232 + (c |>> 10)).to_natural_16, m)
						p.put_natural_16 ((56320 + (c & 1023)).to_natural_16, m + 2)
						m := m + 4
					end
				else
						-- Escaped surrogate code unit: write it back verbatim.
					l_decoded := False
					p.put_natural_16 (c.to_natural_16, m)
					m := m + 2
				end
			end
			if l_resized then
					-- Adjust the size to the written data plus the terminating zero.
				p.resize (m + 2)
			end
			p.put_natural_16 (0, m)
			if a_new_upper /= Void then
					-- Report the byte index of the terminating zero.
				a_new_upper.put (m)
			end
		ensure
			instance_free: class
			p_count_may_increase: p.count >= old p.count
			roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_escaped_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string_general (s.substring (start_pos, end_pos))
			roundtrip: (a_new_upper = Void and then not s.substring (start_pos, end_pos).has
('%U'.to_character_32)) implies utf_16_0_subpointer_to_escaped_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string_general (s.substring (start_pos, end_pos))
		end

	escaped_utf_32_string_to_utf_16le_string_8 (s: READABLE_STRING_GENERAL): STRING_8
			-- UTF-16LE sequence corresponding to `s' interpreted as a UTF-32 sequence that could be escaped.
			-- If `s' contains the `Escape_character' followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s' has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
		do
				-- Initial capacity: two bytes per character of `s'.
			create Result.make (s.count * 2)
			escaped_utf_32_string_into_utf_16le_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: utf_16le_string_8_to_escaped_string_32 (Result).same_string_general (s)
		end

	escaped_utf_32_string_into_utf_16le_string_8 (s: READABLE_STRING_GENERAL; a_result: STRING_8)
			-- Copy UTF-16LE sequence corresponding to `s' interpreted as a UTF-32 sequence that could be
			-- escaped appended into `a_result'.
			-- If `s' contains the `Escape_character' followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s' has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
local
			i: like {STRING_32}.count
			n: like {STRING_32}.count
			c: NATURAL_32
			l_nat16: NATURAL_16
			l_encoded_value: READABLE_STRING_GENERAL
			l_decoded: BOOLEAN
		do
			from
				n := s.count
					-- Reserve two bytes per character of `s'.
				a_result.grow (a_result.count + n * 2)
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c = Escape_character.natural_32_code then
					if i < n then
						if s.item (i + 1) = Escape_character then
								-- Doubled escape character: emit a single one.
							i := i + 1
						elseif s.item (i + 1) = 'u'.to_character_32 then
								-- Possibly "uHHHH": the 4 hexadecimal digits are at i + 2 .. i + 5.
							if i + 4 < n then
								l_encoded_value := s.substring (i + 2, i + 5)
								if is_hexa_decimal (l_encoded_value) then
									c := to_natural_32 (l_encoded_value)
									if c < 55296 or c > 57343 then
											-- Not a surrogate code unit (0xD800 .. 0xDFFF):
											-- keep the escape character as is.
										c := Escape_character.natural_32_code
									else
										l_decoded := True
										i := i + 5
									end
								end
							end
						end
					end
				end
				if not l_decoded then
					if c <= 65535 then
							-- Single code unit, low byte first (little endian).
						a_result.extend ((c & 255).to_character_8)
						a_result.extend (((c & 65280) |>> 8).to_character_8)
					else
							-- Surrogate pair: high surrogate 0xD7C0 + (c >> 10) ...
						l_nat16 := (55232 + (c |>> 10)).to_natural_16
						a_result.extend ((l_nat16 & 255).to_character_8)
						a_result.extend (((l_nat16 & 65280) |>> 8).to_character_8)
							-- ... followed by low surrogate 0xDC00 + (c & 0x3FF).
						l_nat16 := (56320 + (c & 1023)).to_natural_16
						a_result.extend ((l_nat16 & 255).to_character_8)
						a_result.extend (((l_nat16 & 65280) |>> 8).to_character_8)
					end
				else
						-- Escaped surrogate code unit: write it back verbatim, low byte first.
					l_decoded := False
					a_result.extend ((c & 255).to_character_8)
					a_result.extend (((c & 65280) |>> 8).to_character_8)
				end
			end
		ensure
			instance_free: class
			roundtrip: utf_16le_string_8_to_escaped_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string_general (s)
		end

feature -- UTF-16 to UTF-32

	utf_16_0_pointer_to_string_32 (p: MANAGED_POINTER): STRING_32
			-- STRING_32 object corresponding to UTF-16 sequence `p' which is zero-terminated.
require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		do
			create Result.make (p.count)
			utf_16_0_pointer_into_string_32 (p, Result)
		ensure
			instance_free: class
				-- Code unit number `k' (0-based) of the re-encoded result is located at byte
				-- offset `k * 2' in `p'.
			roundtrip: is_valid_utf_16_subpointer (p, 0, p.count // 2, True) implies across string_32_to_utf_16 (Result) as l_spec all l_spec.item = p.read_natural_16 (l_spec.target_index * 2) end
		end

	utf_16_0_pointer_into_string_32 (p: MANAGED_POINTER; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-16 sequence `p' which is zero-terminated
			-- appended into `a_result'.
		require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		do
				-- Decode all code units of `p' (0 .. p.count // 2 - 1), stopping at the first null.
			utf_16_0_subpointer_into_string_32 (p, 0, p.count // 2 - 1, True, a_result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16_subpointer (p, 0, p.count // 2, True) implies across string_32_to_utf_16 (a_result.substring (old a_result.count + 1, a_result.count)) as l_spec all l_spec.item = p.read_natural_16 (l_spec.target_index * 2) end
		end

	utf_16_0_subpointer_to_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER_32; a_stop_at_null: BOOLEAN): STRING_32
			-- STRING_32 object corresponding to UTF-16 sequence `p' between code units `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null'.
require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		do
				-- At most one character per code unit in the requested range.
			create Result.make (end_pos - start_pos + 1)
			utf_16_0_subpointer_into_string_32 (p, start_pos, end_pos, a_stop_at_null, Result)
		ensure
			instance_free: class
				-- Code unit number `k' (0-based) of the re-encoded result is located at byte
				-- offset `(start_pos + k) * 2' in `p'.
			roundtrip: is_valid_utf_16_subpointer (p, start_pos, end_pos, a_stop_at_null) implies across string_32_to_utf_16 (Result) as l_spec all l_spec.item = p.read_natural_16 ((start_pos + l_spec.target_index) * 2) end
		end

	utf_16_0_subpointer_into_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER_32; a_stop_at_null: BOOLEAN; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-16 sequence `p' between code units `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null' appended into `a_result'.
			-- Surrogate pairs are not validated; a surrogate that is the last code unit of the
			-- range is dropped.
		require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		local
			i, n: INTEGER_32
			c: NATURAL_32
		do
			from
				a_result.grow (a_result.count + end_pos - start_pos + 1)
					-- `i' and `n' are byte offsets in `p'.
				i := start_pos * 2
				n := end_pos * 2
			until
				i > n
			loop
				c := p.read_natural_16 (i).to_natural_32
				if c = 0 and a_stop_at_null then
						-- Null code unit found: force loop exit.
					i := n + 1
				else
					i := i + 2
					if c < 55296 or c >= 57344 then
							-- Outside of the surrogate range 0xD800 .. 0xDFFF: a character on its own.
						a_result.extend (c.to_character_32)
					else
						if i <= n then
								-- Combine with the next code unit:
								-- (high |<< 10) + low - ((0xD800 |<< 10) + 0xDC00 - 0x10000).
							a_result.extend (((c.as_natural_32 |<< 10) + p.read_natural_16 (i).to_natural_32 - 56613888).to_character_32)
							i := i + 2
						end
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16_subpointer (p, start_pos, end_pos, a_stop_at_null) implies across string_32_to_utf_16 (a_result.substring (old a_result.count + 1, a_result.count)) as l_spec all l_spec.item = p.read_natural_16 ((start_pos + l_spec.target_index) * 2) end
		end

	utf_16_0_pointer_to_escaped_string_32 (p: MANAGED_POINTER): STRING_32
			-- STRING_32 object corresponding to UTF-16 sequence `p' which is zero-terminated,
			-- where invalid UTF-16LE sequences are escaped.
require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		do
			create Result.make (p.count)
			utf_16_0_pointer_into_escaped_string_32 (p, Result)
		ensure
			instance_free: class
				-- The cursor visits every other byte of the re-encoded UTF-16LE string (low bytes);
				-- each low/high byte pair is compared with the code unit of `p' at the same byte offset.
			roundtrip: attached escaped_utf_32_string_to_utf_16le_string_8 (Result) as l_utf and then across l_utf.new_cursor.incremented (1) as l_str all (l_str.item.natural_32_code | (l_utf.code (l_str.target_index + 1) |<< 8)) = p.read_natural_16 (l_str.target_index - 1).to_natural_32 end
		end

	utf_16_0_pointer_into_escaped_string_32 (p: MANAGED_POINTER; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-16 sequence `p' which is zero-terminated,
			-- where invalid UTF-16LE sequences are escaped, appended into `a_result'.
		require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		do
				-- Decode all code units of `p' (0 .. p.count // 2 - 1), stopping at the first null.
			utf_16_0_subpointer_into_escaped_string_32 (p, 0, p.count // 2 - 1, True, a_result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_16le_string_8 (a_result.substring (old a_result.count + 1, a_result.count)) as l_utf and then across l_utf.new_cursor.incremented (1) as l_str all (l_str.item.natural_32_code | (l_utf.code (l_str.target_index + 1) |<< 8)) = p.read_natural_16 (l_str.target_index - 1).to_natural_32 end
		end

	utf_16_0_subpointer_to_escaped_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER_32; a_stop_at_null: BOOLEAN): STRING_32
			-- STRING_32 object corresponding to UTF-16 sequence `p' between code units `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null', where invalid
			-- UTF-16LE sequences are escaped.
require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		do
				-- Initial capacity: one character per code unit in the requested range.
			create Result.make (end_pos - start_pos + 1)
			utf_16_0_subpointer_into_escaped_string_32 (p, start_pos, end_pos, a_stop_at_null, Result)
		ensure
			instance_free: class
				-- Each low/high byte pair of the re-encoded UTF-16LE string is compared with the
				-- code unit of `p' at the same byte offset, shifted by `start_pos * 2'.
			roundtrip: attached escaped_utf_32_string_to_utf_16le_string_8 (Result) as l_utf and then across l_utf.new_cursor.incremented (1) as l_str all (l_str.item.natural_32_code | (l_utf.code (l_str.target_index + 1) |<< 8)) = p.read_natural_16 (start_pos * 2 + l_str.target_index - 1).to_natural_32 end
		end

	utf_16_0_subpointer_into_escaped_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER_32; a_stop_at_null: BOOLEAN; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-16 sequence `p' between code units `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null', where invalid
			-- UTF-16LE sequences are escaped, appended into `a_result'.
require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		local
			i, n: INTEGER_32
			c1, c2: NATURAL_32
		do
			from
				a_result.grow (a_result.count + end_pos - start_pos + 1)
					-- `i' and `n' are byte offsets in `p'.
				i := start_pos * 2
				n := end_pos * 2
			until
				i > n
			loop
				c1 := p.read_natural_16 (i).to_natural_32
				if c1 = 0 and a_stop_at_null then
						-- Null code unit found: force loop exit.
					i := n + 1
				else
					i := i + 2
					if c1 < 55296 or c1 >= 57344 then
							-- Outside of the surrogate range 0xD800 .. 0xDFFF: a character on its own.
						a_result.extend (c1.to_character_32)
					elseif c1 <= 56319 and then i <= n then
							-- High surrogate (up to 0xDBFF) with a following code unit available.
						c2 := p.read_natural_16 (i).to_natural_32
						if c2 >= 56320 and c2 <= 57343 then
								-- Valid low surrogate (0xDC00 .. 0xDFFF): combine the pair as
								-- (high |<< 10) + low - ((0xD800 |<< 10) + 0xDC00 - 0x10000).
							a_result.extend (((c1 |<< 10) + c2 - 56613888).to_character_32)
							i := i + 2
						else
								-- High surrogate not followed by a low surrogate: escape it.
								-- The following code unit is processed by the next iteration.
							escape_code_into (a_result, c1.as_natural_16)
						end
					else
							-- Lone low surrogate, or high surrogate at the end of the range: escape it.
						escape_code_into (a_result, c1.as_natural_16)
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_16le_string_8 (a_result.substring (old a_result.count + 1, a_result.count)) as l_utf and then across l_utf.new_cursor.incremented (1) as l_str all (l_str.item.natural_32_code | (l_utf.code (l_str.target_index + 1) |<< 8)) = p.read_natural_16 (start_pos * 2 + l_str.target_index - 1).to_natural_32 end
		end

	utf_16_to_string_32 (s: SPECIAL [NATURAL_16]): STRING_32
			-- STRING_32 object corresponding to UTF-16 sequence `s'.
		do
				-- Initial capacity: one character per code unit.
			create Result.make (s.count)
			utf_16_into_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16 (s) implies string_32_to_utf_16 (Result).is_equal (s)
		end

	utf_16_into_string_32 (s: SPECIAL [NATURAL_16]; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-16 sequence `s'
			-- appended into `a_result'.
local
			i: like {SPECIAL [NATURAL_16]}.count
			n: like {SPECIAL [NATURAL_16]}.count
			c: NATURAL_32
		do
			from
				n := s.count
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
					-- `s' is indexed from 0; `i' is advanced right after the read.
				c := s [i].to_natural_32
				i := i + 1
				if c < 55296 or c >= 57344 then
						-- Outside of the surrogate range 0xD800 .. 0xDFFF: a character on its own.
					a_result.extend (c.to_character_32)
				else
					if i < n then
							-- Combine with the next code unit (not validated):
							-- (high |<< 10) + low - ((0xD800 |<< 10) + 0xDC00 - 0x10000).
						a_result.extend (((c |<< 10) + s [i].to_natural_32 - 56613888).to_character_32)
						i := i + 1
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16 (s) implies string_32_to_utf_16 (a_result.substring (old a_result.count + 1, a_result.count)).is_equal (s)
		end

	utf_16le_string_8_to_string_32 (s: READABLE_STRING_8): STRING_32
			-- STRING_32 object corresponding to UTF-16LE sequence `s'.
		do
				-- Initial capacity: one character per pair of bytes.
			create Result.make (s.count |>> 1)
			utf_16le_string_8_into_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16le_string_8 (s) implies escaped_utf_32_string_to_utf_16le_string_8 (Result).same_string (s)
		end

	utf_16le_string_8_into_string_32 (s: READABLE_STRING_8; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-16LE sequence `s' appended into `a_result'.
		local
			i, nb: INTEGER_32
			c1, c2: NATURAL_32
		do
			from
				nb := s.count
				a_result.grow (a_result.count + (nb |>> 1))
			until
					-- Stop when fewer than two bytes remain.
				i + 1 >= nb
			loop
				i := i + 2
					-- Little endian: low byte at `i - 1', high byte at `i'.
				c1 := s.code (i - 1) | (s.code (i) |<< 8)
				if c1 < 55296 or c1 >= 57344 then
						-- Outside of the surrogate range 0xD800 .. 0xDFFF: a character on its own.
					a_result.extend (c1.to_character_32)
				else
					i := i + 2
					if i <= nb then
							-- Combine with the next code unit (not validated).
						c2 := s.code (i - 1) | (s.code (i) |<< 8)
						a_result.extend (((c1 |<< 10) + c2 - 56613888).to_character_32)
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16le_string_8 (s) implies escaped_utf_32_string_to_utf_16le_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

	utf_16le_string_8_to_escaped_string_32 (s: READABLE_STRING_8): STRING_32
			-- STRING_32 object corresponding to UTF-16LE sequence `s', where invalid UTF-16LE
			-- sequences are escaped.
do
				-- Initial capacity: one character per pair of bytes.
			create Result.make (s.count |>> 1)
			utf_16le_string_8_into_escaped_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: escaped_utf_32_string_to_utf_16le_string_8 (Result).same_string (s)
		end

	utf_16le_string_8_into_escaped_string_32 (s: READABLE_STRING_8; a_result: STRING_32)
			-- Copy STRING_32 object corresponding to UTF-16LE sequence `s', where invalid UTF-16LE
			-- sequences are escaped, appended into `a_result'.
		local
			i, nb: INTEGER_32
			c1, c2: NATURAL_32
		do
			from
				nb := s.count
				a_result.grow (a_result.count + (nb |>> 1))
			until
					-- Stop when fewer than two bytes remain.
				i + 1 >= nb
			loop
				i := i + 2
					-- Little endian: low byte at `i - 1', high byte at `i'.
				c1 := s.code (i - 1) | (s.code (i) |<< 8)
				if c1 < 55296 or c1 >= 57344 then
						-- Outside of the surrogate range 0xD800 .. 0xDFFF: a character on its own.
					a_result.extend (c1.to_character_32)
				elseif c1 <= 56319 and i + 2 <= nb then
						-- High surrogate (up to 0xDBFF) with two more bytes available.
					c2 := s.code (i + 1) | (s.code (i + 2) |<< 8)
					if c2 >= 56320 and c2 <= 57343 then
							-- Valid low surrogate (0xDC00 .. 0xDFFF): combine the pair.
						a_result.extend (((c1 |<< 10) + c2 - 56613888).to_character_32)
						i := i + 2
					else
							-- High surrogate not followed by a low surrogate: escape it.
						escape_code_into (a_result, c1.as_natural_16)
					end
				else
						-- Lone low surrogate, or high surrogate at the end of `s': escape it.
					escape_code_into (a_result, c1.as_natural_16)
				end
			end
		ensure
			instance_free: class
			roundtrip: escaped_utf_32_string_to_utf_16le_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

feature -- UTF-16 to UTF-8

	utf_16_to_utf_8_string_8 (s: SPECIAL [NATURAL_16]): STRING_8
			-- UTF-8 sequence corresponding to UTF-16 sequence `s'.
		do
			debug ("to_implement")
				(create {REFACTORING_HELPER}).to_implement ("Convert directly from UTF-16 to UTF-8.")
			end
				-- Two-step conversion through an intermediate STRING_32.
			Result := string_32_to_utf_8_string_8 (utf_16_to_string_32 (s))
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16 (s) implies string_32_to_utf_16 (utf_8_string_8_to_string_32 (Result)).is_equal (s)
		end

	utf_16_into_utf_8_string_8 (s: SPECIAL [NATURAL_16]; a_result: STRING_8)
			-- Copy UTF-8 sequence corresponding to UTF-16 sequence `s' appended into `a_result'.
do
			debug ("to_implement")
				(create {REFACTORING_HELPER}).to_implement ("Convert directly from UTF-16 to UTF-8.")
			end
				-- Two-step conversion through an intermediate STRING_32.
			string_32_into_utf_8_string_8 (utf_16_to_string_32 (s), a_result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16 (s) implies string_32_to_utf_16 (utf_8_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count))).is_equal (s)
		end

	utf_16le_string_8_to_utf_8_string_8 (s: READABLE_STRING_8): STRING_8
			-- UTF-8 sequence corresponding to UTF-16LE sequence `s'.
			-- NOTE(review): the routine called below requires `s.count' to be even,
			-- which is not required here -- confirm with callers.
		do
			create Result.make (s.count)
			utf_16le_string_8_into_utf_8_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16le_string_8 (s) implies utf_32_string_to_utf_16le_string_8 (utf_8_string_8_to_string_32 (Result)).same_string (s)
		end

	utf_16le_string_8_into_utf_8_string_8 (s: READABLE_STRING_8; a_result: STRING_8)
			-- Copy UTF-8 sequence corresponding to UTF-16LE sequence `s' appended into `a_result'.
		require
			even_count: (s.count & 1) = 0
		local
			v: SPECIAL [NATURAL_16]
			i: like {STRING_8}.count
			n: like {STRING_8}.count
		do
			from
				n := s.count
					-- One code unit per pair of bytes.
				create v.make_empty (n |>> 1)
			until
				i >= n
			loop
				i := i + 2
				check
					valid_index: 1 <= i - 1 and i <= s.count
				end
					-- Little endian: low byte at `i - 1', high byte at `i'.
				v.extend (s [i - 1].code.as_natural_16 | (s [i].code.as_natural_16 |<< 8))
			end
				-- Convert the assembled code units to UTF-8.
			utf_16_into_utf_8_string_8 (v, a_result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16le_string_8 (s) implies utf_32_string_to_utf_16le_string_8 (utf_8_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count))).same_string (s)
		end

feature -- UTF-8 to UTF-16

	utf_8_string_8_to_utf_16 (s: READABLE_STRING_8): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to UTF-8 sequence `s'.
do
			debug ("to_implement")
				(create {REFACTORING_HELPER}).to_implement ("Convert directly from UTF-8 to UTF-16.")
			end
				-- Two-step conversion through an intermediate STRING_32.
			Result := string_32_to_utf_16 (utf_8_string_8_to_string_32 (s))
		ensure
			instance_free: class
			roundtrip: is_valid_utf_8_string_8 (s) implies utf_16_to_utf_8_string_8 (Result).same_string (s)
		end

	utf_8_string_8_to_utf_16_0 (s: READABLE_STRING_8): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to UTF-8 sequence `s' with terminating zero.
		do
			Result := utf_8_string_8_to_utf_16 (s)
				-- Grow by one code unit filled with 0 to hold the terminating zero.
			Result := Result.aliased_resized_area_with_default (0, Result.count + 1)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_8_string_8 (s) implies utf_16_to_utf_8_string_8 (Result).same_string (s)
		end

feature -- Byte Order Mark (BOM)

		-- The BOM constants are spelled with numeric character codes (decimal) so that
		-- their value does not depend on the encoding in which this file is stored or read.

	Utf_8_bom_to_string_8: STRING_8 = "%/239/%/187/%/191/"
			-- UTF-8 BOM sequence (bytes 0xEF 0xBB 0xBF).

	Utf_16be_bom_to_string_8: STRING_8 = "%/254/%/255/"
			-- UTF-16BE BOM sequence (bytes 0xFE 0xFF).

	Utf_16le_bom_to_string_8: STRING_8 = "%/255/%/254/"
			-- UTF-16LE BOM sequence (bytes 0xFF 0xFE).

	Utf_32be_bom_to_string_8: STRING_8 = "%U%U%/254/%/255/"
			-- UTF-32BE BOM sequence (bytes 0x00 0x00 0xFE 0xFF).

	Utf_32le_bom_to_string_8: STRING_8 = "%/255/%/254/%U%U"
			-- UTF-32LE BOM sequence (bytes 0xFF 0xFE 0x00 0x00).

feature {NONE} -- Implementation

	escape_code_into (a_string: STRING_32; a_code: NATURAL_16)
			-- Escape `a_code' as documented in the note clause of the class into `a_string'.
			-- If `a_code' fits into a NATURAL_8, it will be just the `Escape_character' followed
			-- by the 2-digit hexadecimal representation, otherwise `Escape_character' followed
			-- by the letter u followed by the 4-digit hexadecimal representation.
		do
			a_string.append_character (Escape_character)
			if a_code <= {NATURAL_8}.max_value.to_natural_16 then
				a_string.append_string_general (a_code.as_natural_8.to_hex_string)
			else
				a_string.append_character ('u'.to_character_32)
				a_string.append_string_general (a_code.to_hex_string)
			end
		ensure
			instance_free: class
		end

	is_hexa_decimal (a_string: READABLE_STRING_GENERAL): BOOLEAN
			-- Is `a_string' a valid hexadecimal sequence?
local
			l_convertor: like Ctoi_convertor
		do
				-- Parse `a_string' with the shared hexadecimal convertor and report
				-- whether it was recognized as an integral value.
			l_convertor := Ctoi_convertor
			l_convertor.reset ({NUMERIC_INFORMATION}.type_natural_32)
			l_convertor.parse_string_with_type (a_string, {NUMERIC_INFORMATION}.type_natural_32)
			Result := l_convertor.is_integral_integer
		ensure
			instance_free: class
		end

	to_natural_32 (a_hex_string: READABLE_STRING_GENERAL): NATURAL_32
			-- Convert hexadecimal value `a_hex_string' to its corresponding NATURAL_32 value.
		require
			is_hexa: is_hexa_decimal (a_hex_string)
		local
			l_convertor: like Ctoi_convertor
		do
				-- NOTE(review): parsing here uses `type_no_limitation' whereas `is_hexa_decimal'
				-- uses `type_natural_32' -- confirm that the difference is intended.
			l_convertor := Ctoi_convertor
			l_convertor.parse_string_with_type (a_hex_string, {NUMERIC_INFORMATION}.type_no_limitation)
			Result := l_convertor.parsed_natural_32
		ensure
			instance_free: class
		end

	Ctoi_convertor: HEXADECIMAL_STRING_TO_INTEGER_CONVERTER
			-- Convertor used to convert string to integer or natural
			-- Created once and then shared; configured to reject leading and trailing separators.
		once
			create Result.make
			Result.set_leading_separators_acceptable (False)
			Result.set_trailing_separators_acceptable (False)
		ensure
			instance_free: class
			ctoi_convertor_not_void: Result /= Void
		end

note
	copyright: "Copyright (c) 1984-2018, Eiffel Software and others"
	license: "Eiffel Forum License v2 (see http://www.eiffel.com/licensing/forum.txt)"
	source: "[
			Eiffel Software
			5949 Hollister Ave., Goleta, CA 93117 USA
			Telephone 805-685-1006, Fax 805-685-6869
			Website http://www.eiffel.com
			Customer support http://support.eiffel.com
		]"

end -- class UTF_CONVERTER
Generated by ISE EiffelStudio