diff options
author | charlet <charlet@138bc75d-0d04-0410-961f-82ee72b054a4> | 2007-12-13 10:29:24 +0000 |
---|---|---|
committer | charlet <charlet@138bc75d-0d04-0410-961f-82ee72b054a4> | 2007-12-13 10:29:24 +0000 |
commit | c18655b9ce902c81ab97367927cf579421e6a6fb (patch) | |
tree | 43bd0ad62150abfc2319fc29f19db59d9db58ebc | |
parent | c2be248db6c5dd014b6210e4b711260ca5fbd172 (diff) | |
download | ppe42-gcc-c18655b9ce902c81ab97367927cf579421e6a6fb.tar.gz ppe42-gcc-c18655b9ce902c81ab97367927cf579421e6a6fb.zip |
2007-12-06 Robert Dewar <dewar@adacore.com>
* g-byorma.adb, g-byorma.ads, g-decstr.adb, g-decstr.ads,
g-deutst.ads, g-encstr.adb, g-encstr.ads, g-enutst.ads: New files.
* scn.adb: Implement BOM recognition
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@130849 138bc75d-0d04-0410-961f-82ee72b054a4
-rwxr-xr-x | gcc/ada/g-byorma.adb | 191 | ||||
-rwxr-xr-x | gcc/ada/g-byorma.ads | 104 | ||||
-rwxr-xr-x | gcc/ada/g-decstr.adb | 972 | ||||
-rwxr-xr-x | gcc/ada/g-decstr.ads | 163 | ||||
-rw-r--r-- | gcc/ada/g-deutst.ads | 45 | ||||
-rwxr-xr-x | gcc/ada/g-encstr.adb | 260 | ||||
-rwxr-xr-x | gcc/ada/g-encstr.ads | 111 | ||||
-rw-r--r-- | gcc/ada/g-enutst.ads | 45 | ||||
-rw-r--r-- | gcc/ada/scn.adb | 41 |
9 files changed, 1932 insertions, 0 deletions
diff --git a/gcc/ada/g-byorma.adb b/gcc/ada/g-byorma.adb new file mode 100755 index 00000000000..9cc6f08b519 --- /dev/null +++ b/gcc/ada/g-byorma.adb @@ -0,0 +1,191 @@ +------------------------------------------------------------------------------ +-- -- +-- GNAT COMPILER COMPONENTS -- +-- -- +-- G N A T . B Y T E _ O R D E R _ M A R K -- +-- -- +-- B o d y -- +-- -- +-- Copyright (C) 2006-2007, AdaCore -- +-- -- +-- GNAT is free software; you can redistribute it and/or modify it under -- +-- terms of the GNU General Public License as published by the Free Soft- -- +-- ware Foundation; either version 2, or (at your option) any later ver- -- +-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- +-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- +-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -- +-- for more details. You should have received a copy of the GNU General -- +-- Public License distributed with GNAT; see file COPYING. If not, write -- +-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, -- +-- Boston, MA 02110-1301, USA. -- +-- -- +-- As a special exception, if other files instantiate generics from this -- +-- unit, or you link this unit with other files to produce an executable, -- +-- this unit does not by itself cause the resulting executable to be -- +-- covered by the GNU General Public License. This exception does not -- +-- however invalidate any other reasons why the executable file might be -- +-- covered by the GNU Public License. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +package body GNAT.Byte_Order_Mark is + + -------------- + -- Read_BOM -- + -------------- + + procedure Read_BOM + (Str : String; + Len : out Natural; + BOM : out BOM_Kind; + XML_Support : Boolean := False) + is + begin + -- UTF-16 (big-endian) + + if Str'Length >= 2 + and then Str (Str'First) = Character'Val (16#FE#) + and then Str (Str'First + 1) = Character'Val (16#FF#) + then + Len := 2; + BOM := UTF16_BE; + + -- UTF-16 (little-endian) + + elsif Str'Length >= 2 + and then Str (Str'First) = Character'Val (16#FF#) + and then Str (Str'First + 1) = Character'Val (16#FE#) + then + Len := 2; + BOM := UTF16_LE; + + -- UTF-32 (big-endian) + + elsif Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#00#) + and then Str (Str'First + 1) = Character'Val (16#00#) + and then Str (Str'First + 2) = Character'Val (16#FE#) + and then Str (Str'First + 3) = Character'Val (16#FF#) + then + Len := 4; + BOM := UTF32_BE; + + -- UTF-32 (little-endian) + + elsif Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#FF#) + and then Str (Str'First + 1) = Character'Val (16#FE#) + and then Str (Str'First + 2) = Character'Val (16#00#) + and then Str (Str'First + 3) = Character'Val (16#00#) + then + Len := 4; + BOM := UTF32_LE; + + -- UTF-8 (endian-independent) + + elsif Str'Length >= 3 + and then Str (Str'First) = Character'Val (16#EF#) + and then Str (Str'First + 1) = Character'Val (16#BB#) + and then Str (Str'First + 2) = Character'Val (16#BF#) + then + Len := 3; + BOM := UTF8_All; + + -- UCS-4 (big-endian) XML only + + elsif XML_Support + and then Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#00#) + and then Str (Str'First + 1) = Character'Val (16#00#) + and then Str (Str'First + 2) = Character'Val (16#00#) + and then Str (Str'First + 3) = Character'Val (16#3C#) + then + Len := 0; + BOM := UCS4_BE; + + -- UCS-4 (little-endian) XML case + + elsif XML_Support + and then Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#3C#) + and then Str (Str'First + 1) = Character'Val (16#00#) + and then Str (Str'First + 2) = Character'Val (16#00#) + and then Str (Str'First + 3) = Character'Val (16#00#) + then + Len := 0; + BOM := UCS4_LE; + + -- UCS-4 (unusual byte order 2143) XML case + + elsif XML_Support + and then Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#00#) + and then Str (Str'First + 1) = Character'Val (16#00#) + and then Str (Str'First + 2) = Character'Val (16#3C#) + and then Str (Str'First + 3) = Character'Val (16#00#) + then + Len := 0; + BOM := UCS4_2143; + + -- UCS-4 (unusual byte order 3412) XML case + + elsif XML_Support + and then Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#00#) + and then Str (Str'First + 1) = Character'Val (16#3C#) + and then Str (Str'First + 2) = Character'Val (16#00#) + and then Str (Str'First + 3) = Character'Val (16#00#) + then + Len := 0; + BOM := UCS4_3412; + + -- UTF-16 (big-endian) XML case + + elsif XML_Support + and then Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#00#) + and then Str (Str'First + 1) = Character'Val (16#3C#) + and then Str (Str'First + 2) = Character'Val (16#00#) + and then Str (Str'First + 3) = Character'Val (16#3F#) + then + Len := 0; + BOM := UTF16_BE; + + -- UTF-32 (little-endian) XML case + + elsif XML_Support + and then Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#3C#) + and then Str (Str'First + 1) = Character'Val (16#00#) + and then Str (Str'First + 2) = Character'Val (16#3F#) + and then Str (Str'First + 3) = Character'Val (16#00#) + then + Len := 0; + BOM := UTF16_LE; + + -- Unrecognized special encodings XML only + + elsif XML_Support + and then Str'Length >= 4 + and then Str (Str'First) = Character'Val (16#3C#) + and then Str (Str'First + 1) = Character'Val (16#3F#) + and then Str (Str'First + 2) = Character'Val (16#78#) + and then Str (Str'First + 3) = Character'Val (16#6D#) + then + -- Utf8, ASCII, some part of ISO8859, Shift-JIS, EUC,... + + Len := 0; + BOM := Unknown; + + -- No BOM recognized + + else + Len := 0; + BOM := Unknown; + end if; + end Read_BOM; + +end GNAT.Byte_Order_Mark; diff --git a/gcc/ada/g-byorma.ads b/gcc/ada/g-byorma.ads new file mode 100755 index 00000000000..6016f755bcc --- /dev/null +++ b/gcc/ada/g-byorma.ads @@ -0,0 +1,104 @@ +------------------------------------------------------------------------------ +-- -- +-- GNAT COMPILER COMPONENTS -- +-- -- +-- G N A T . B Y T E _ O R D E R _ M A R K -- +-- -- +-- S p e c -- +-- -- +-- Copyright (C) 2006-2007, AdaCore -- +-- -- +-- GNAT is free software; you can redistribute it and/or modify it under -- +-- terms of the GNU General Public License as published by the Free Soft- -- +-- ware Foundation; either version 2, or (at your option) any later ver- -- +-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- +-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- +-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -- +-- for more details. You should have received a copy of the GNU General -- +-- Public License distributed with GNAT; see file COPYING. If not, write -- +-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, -- +-- Boston, MA 02110-1301, USA. -- +-- -- +-- As a special exception, if other files instantiate generics from this -- +-- unit, or you link this unit with other files to produce an executable, -- +-- this unit does not by itself cause the resulting executable to be -- +-- covered by the GNU General Public License. This exception does not -- +-- however invalidate any other reasons why the executable file might be -- +-- covered by the GNU Public License. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +-- This package provides a procedure for reading and interpreting the BOM +-- (byte order mark) used to publish the encoding method for a string (for +-- example, a UTF-8 encoded file in windows will start with the appropriate +-- BOM sequence to signal UTF-8 encoding. + +-- There are two cases + +-- Case 1. UTF encodings for Unicode files + +-- Here the convention is to have the first character of the file be a +-- non-breaking zero width space character (16#0000_FEFF#). For the UTF +-- encodings, the representation of this character can be used to uniquely +-- determine the encoding. Furthermore, the possibility of any confusion +-- with unencoded files is minimal, since for example the UTF-8 encoding +-- of this character looks like the sequence: + +-- LC_I_Diaeresis +-- Right_Angle_Quotation +-- Fraction_One_Half + +-- which is so unlikely to occur legitimately in normal use that it can +-- safely be ignored in most cases (for example, no legitimate Ada source +-- file could start with this sequence of characters). + +-- Case 2. Specialized XML encodings + +-- The XML standard defines a number of other possible encodings and also +-- defines standardized sequences for marking these encodings. This package +-- can also optionally handle these XML defined BOM sequences. These XML +-- cases depend on the first character of the XML file being < so that the +-- encoding of this character can be recognized. + +pragma Warnings (Off); +pragma Compiler_Unit; +pragma Warnings (On); + +package GNAT.Byte_Order_Mark is + + type BOM_Kind is + (UTF8_All, -- UTF8-encoding + UTF16_LE, -- UTF16 little-endian encoding + UTF16_BE, -- UTF16 big-endian encoding + UTF32_LE, -- UTF32 little-endian encoding + UTF32_BE, -- UTF32 big-endian encoding + + -- The following cases are for XML only + + UCS4_BE, -- UCS-4, big endian machine (1234 order) + UCS4_LE, -- UCS-4, little endian machine (4321 order) + UCS4_2143, -- UCS-4, unusual byte order (2143 order) + UCS4_3412, -- UCS-4, unusual byte order (3412 order) + + -- Value returned if no BOM recognized + + Unknown); -- Unknown, assumed to be ASCII compatible + + procedure Read_BOM + (Str : String; + Len : out Natural; + BOM : out BOM_Kind; + XML_Support : Boolean := False); + -- This is the routine to read the BOM from the start of the given string + -- Str. On return BOM is set to the appropriate BOM_Kind and Len is set to + -- its length. The caller will typically skip the first Len characters in + -- the string to ignore the BOM sequence. The special XML possibilities are + -- recognized only if flag XML_Support is set to True. Note that for the + -- XML cases, Len is always set to zero on return (not to the length of the + -- relevant sequence) since in the XML cases, the sequence recognized is + -- for the first real character in the file (<) which is not to be skipped. + +end GNAT.Byte_Order_Mark; diff --git a/gcc/ada/g-decstr.adb b/gcc/ada/g-decstr.adb new file mode 100755 index 00000000000..580ad15a4a6 --- /dev/null +++ b/gcc/ada/g-decstr.adb @@ -0,0 +1,972 @@ +------------------------------------------------------------------------------ +-- -- +-- GNAT RUN-TIME COMPONENTS -- +-- -- +-- G N A T . D E C O D E _ S T R I N G -- +-- -- +-- S p e c -- +-- -- +-- Copyright (C) 2007, AdaCore -- +-- -- +-- GNAT is free software; you can redistribute it and/or modify it under -- +-- terms of the GNU General Public License as published by the Free Soft- -- +-- ware Foundation; either version 2, or (at your option) any later ver- -- +-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- +-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- +-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -- +-- for more details. You should have received a copy of the GNU General -- +-- Public License distributed with GNAT; see file COPYING. If not, write -- +-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, -- +-- Boston, MA 02110-1301, USA. -- +-- -- +-- As a special exception, if other files instantiate generics from this -- +-- unit, or you link this unit with other files to produce an executable, -- +-- this unit does not by itself cause the resulting executable to be -- +-- covered by the GNU General Public License. This exception does not -- +-- however invalidate any other reasons why the executable file might be -- +-- covered by the GNU Public License. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +-- This package provides a utility routine for converting from an encoded +-- string to a corresponding Wide_String or Wide_Wide_String value. + +with Interfaces; use Interfaces; + +with System.WCh_Cnv; use System.WCh_Cnv; +with System.WCh_Con; use System.WCh_Con; + +package body GNAT.Decode_String is + + ----------------------- + -- Local Subprograms -- + ----------------------- + + procedure Bad; + pragma No_Return (Bad); + -- Raise error for bad encoding + + procedure Past_End; + pragma No_Return (Past_End); + -- Raise error for off end of string + + --------- + -- Bad -- + --------- + + procedure Bad is + begin + raise Constraint_Error with + "bad encoding or character out of range"; + end Bad; + + --------------------------- + -- Decode_Wide_Character -- + --------------------------- + + procedure Decode_Wide_Character + (Input : String; + Ptr : in out Natural; + Result : out Wide_Character) + is + Char : Wide_Wide_Character; + begin + Decode_Wide_Wide_Character (Input, Ptr, Char); + + if Wide_Wide_Character'Pos (Char) > 16#FFFF# then + Bad; + else + Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char)); + end if; + end Decode_Wide_Character; + + ------------------------ + -- Decode_Wide_String -- + ------------------------ + + function Decode_Wide_String (S : String) return Wide_String is + Result : Wide_String (1 .. S'Length); + Length : Natural; + begin + Decode_Wide_String (S, Result, Length); + return Result (1 .. Length); + end Decode_Wide_String; + + procedure Decode_Wide_String + (S : String; + Result : out Wide_String; + Length : out Natural) + is + Ptr : Natural; + + begin + Ptr := S'First; + Length := 0; + while Ptr <= S'Last loop + if Length >= Result'Last then + Past_End; + end if; + + Length := Length + 1; + Decode_Wide_Character (S, Ptr, Result (Length)); + end loop; + end Decode_Wide_String; + + -------------------------------- + -- Decode_Wide_Wide_Character -- + -------------------------------- + + procedure Decode_Wide_Wide_Character + (Input : String; + Ptr : in out Natural; + Result : out Wide_Wide_Character) + is + C : Character; + + function In_Char return Character; + pragma Inline (In_Char); + -- Function to get one input character + + ------------- + -- In_Char -- + ------------- + + function In_Char return Character is + begin + if Ptr <= Input'Last then + Ptr := Ptr + 1; + return Input (Ptr - 1); + else + Past_End; + end if; + end In_Char; + + -- Start of processing for Decode_Wide_Wide_Character + + begin + C := In_Char; + + -- Special fast processing for UTF-8 case + + if Encoding_Method = WCEM_UTF8 then + UTF8 : declare + U : Unsigned_32; + W : Unsigned_32; + + procedure Get_UTF_Byte; + pragma Inline (Get_UTF_Byte); + -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode. + -- Reads a byte, and raises CE if the first two bits are not 10. + -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits. + + ------------------ + -- Get_UTF_Byte -- + ------------------ + + procedure Get_UTF_Byte is + begin + U := Unsigned_32 (Character'Pos (In_Char)); + + if (U and 2#11000000#) /= 2#10_000000# then + Bad; + end if; + + W := Shift_Left (W, 6) or (U and 2#00111111#); + end Get_UTF_Byte; + + -- Start of processing for UTF8 case + + begin + -- Note: for details of UTF8 encoding see RFC 3629 + + U := Unsigned_32 (Character'Pos (C)); + + -- 16#00_0000#-16#00_007F#: 0xxxxxxx + + if (U and 2#10000000#) = 2#00000000# then + Result := Wide_Wide_Character'Val (Character'Pos (C)); + + -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx + + elsif (U and 2#11100000#) = 2#110_00000# then + W := U and 2#00011111#; + Get_UTF_Byte; + Result := Wide_Wide_Character'Val (W); + + -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx + + elsif (U and 2#11110000#) = 2#1110_0000# then + W := U and 2#00001111#; + Get_UTF_Byte; + Get_UTF_Byte; + Result := Wide_Wide_Character'Val (W); + + -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + + elsif (U and 2#11111000#) = 2#11110_000# then + W := U and 2#00000111#; + + for K in 1 .. 3 loop + Get_UTF_Byte; + end loop; + + Result := Wide_Wide_Character'Val (W); + + -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx + + elsif (U and 2#11111100#) = 2#111110_00# then + W := U and 2#00000011#; + + for K in 1 .. 4 loop + Get_UTF_Byte; + end loop; + + Result := Wide_Wide_Character'Val (W); + + -- All other cases are invalid, note that this includes: + + -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx 10xxxxxx + + -- since Wide_Wide_Character does not include code values + -- greater than 16#03FF_FFFF#. + + else + Bad; + end if; + end UTF8; + + -- All encoding functions other than UTF-8 + + else + Non_UTF8 : declare + function Char_Sequence_To_UTF is + new Char_Sequence_To_UTF_32 (In_Char); + + begin + -- For brackets, must test for specific case of [ not followed by + -- quotation, where we must not call Char_Sequence_To_UTF, but + -- instead just return the bracket unchanged. + + if Encoding_Method = WCEM_Brackets + and then C = '[' + and then (Ptr > Input'Last or else Input (Ptr) /= '"') + then + Result := '['; + + -- All other cases including [" with Brackets + + else + Result := + Wide_Wide_Character'Val + (Char_Sequence_To_UTF (C, Encoding_Method)); + end if; + end Non_UTF8; + end if; + end Decode_Wide_Wide_Character; + + ----------------------------- + -- Decode_Wide_Wide_String -- + ----------------------------- + + function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is + Result : Wide_Wide_String (1 .. S'Length); + Length : Natural; + begin + Decode_Wide_Wide_String (S, Result, Length); + return Result (1 .. Length); + end Decode_Wide_Wide_String; + + procedure Decode_Wide_Wide_String + (S : String; + Result : out Wide_Wide_String; + Length : out Natural) + is + Ptr : Natural; + + begin + Ptr := S'First; + Length := 0; + while Ptr <= S'Last loop + if Length >= Result'Last then + Past_End; + end if; + + Length := Length + 1; + Decode_Wide_Wide_Character (S, Ptr, Result (Length)); + end loop; + end Decode_Wide_Wide_String; + + ------------------------- + -- Next_Wide_Character -- + ------------------------- + + procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is + begin + if Ptr < Input'First then + Past_End; + end if; + + -- Special efficient encoding for UTF-8 case + + if Encoding_Method = WCEM_UTF8 then + UTF8 : declare + U : Unsigned_32; + + procedure Getc; + pragma Inline (Getc); + -- Gets the character at Input (Ptr) and returns code in U as + -- Unsigned_32 value. On return Ptr is bumped past the character. + + procedure Skip_UTF_Byte; + pragma Inline (Skip_UTF_Byte); + -- Skips past one encoded byte which must be 2#10xxxxxx# + + ---------- + -- Getc -- + ---------- + + procedure Getc is + begin + if Ptr > Input'Last then + Past_End; + else + U := Unsigned_32 (Character'Pos (Input (Ptr))); + Ptr := Ptr + 1; + end if; + end Getc; + + ------------------- + -- Skip_UTF_Byte -- + ------------------- + + procedure Skip_UTF_Byte is + begin + Getc; + + if (U and 2#11000000#) /= 2#10_000000# then + Bad; + end if; + end Skip_UTF_Byte; + + -- Start of processing for UTF-8 case + + begin + -- 16#00_0000#-16#00_007F#: 0xxxxxxx + + Getc; + + if (U and 2#10000000#) = 2#00000000# then + return; + + -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx + + elsif (U and 2#11100000#) = 2#110_00000# then + Skip_UTF_Byte; + + -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx + + elsif (U and 2#11110000#) = 2#1110_0000# then + Skip_UTF_Byte; + Skip_UTF_Byte; + + -- Any other code is invalid, note that this includes: + + -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + + -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx + + -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx 10xxxxxx + + -- since Wide_Character does not allow codes > 16#FFFF# + + else + Bad; + end if; + end UTF8; + + -- Non-UTF-8 cass + + else + declare + Discard : Wide_Character; + begin + Decode_Wide_Character (Input, Ptr, Discard); + end; + end if; + end Next_Wide_Character; + + ------------------------------ + -- Next_Wide_Wide_Character -- + ------------------------------ + + procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is + begin + -- Special efficient encoding for UTF-8 case + + if Encoding_Method = WCEM_UTF8 then + UTF8 : declare + U : Unsigned_32; + + procedure Getc; + pragma Inline (Getc); + -- Gets the character at Input (Ptr) and returns code in U as + -- Unsigned_32 value. On return Ptr is bumped past the character. + + procedure Skip_UTF_Byte; + pragma Inline (Skip_UTF_Byte); + -- Skips past one encoded byte which must be 2#10xxxxxx# + + ---------- + -- Getc -- + ---------- + + procedure Getc is + begin + if Ptr > Input'Last then + Past_End; + else + U := Unsigned_32 (Character'Pos (Input (Ptr))); + Ptr := Ptr + 1; + end if; + end Getc; + + ------------------- + -- Skip_UTF_Byte -- + ------------------- + + procedure Skip_UTF_Byte is + begin + Getc; + + if (U and 2#11000000#) /= 2#10_000000# then + Bad; + end if; + end Skip_UTF_Byte; + + -- Start of processing for UTF-8 case + + begin + if Ptr < Input'First then + Past_End; + end if; + + -- 16#00_0000#-16#00_007F#: 0xxxxxxx + + Getc; + + if (U and 2#10000000#) = 2#00000000# then + null; + + -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx + + elsif (U and 2#11100000#) = 2#110_00000# then + Skip_UTF_Byte; + + -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx + + elsif (U and 2#11110000#) = 2#1110_0000# then + Skip_UTF_Byte; + Skip_UTF_Byte; + + -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + + elsif (U and 2#11111000#) = 2#11110_000# then + for K in 1 .. 3 loop + Skip_UTF_Byte; + end loop; + + -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx + + elsif (U and 2#11111100#) = 2#111110_00# then + for K in 1 .. 4 loop + Skip_UTF_Byte; + end loop; + + -- Any other code is invalid, note that this includes: + + -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx 10xxxxxx + + -- since Wide_Wide_Character does not allow codes > 16#03FF_FFFF# + + else + Bad; + end if; + end UTF8; + + -- Non-UTF-8 cass + + else + declare + Discard : Wide_Wide_Character; + begin + Decode_Wide_Wide_Character (Input, Ptr, Discard); + end; + end if; + end Next_Wide_Wide_Character; + + -------------- + -- Past_End -- + -------------- + + procedure Past_End is + begin + raise Constraint_Error with "past end of string"; + end Past_End; + + ------------------------- + -- Prev_Wide_Character -- + ------------------------- + + procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is + begin + if Ptr > Input'Last + 1 then + Past_End; + end if; + + -- Special efficient encoding for UTF-8 case + + if Encoding_Method = WCEM_UTF8 then + UTF8 : declare + U : Unsigned_32; + + procedure Getc; + pragma Inline (Getc); + -- Gets the character at Input (Ptr - 1) and returns code in U as + -- Unsigned_32 value. On return Ptr is decremented by one. + + procedure Skip_UTF_Byte; + pragma Inline (Skip_UTF_Byte); + -- Checks that U is 2#10xxxxxx# and then calls Get + + ---------- + -- Getc -- + ---------- + + procedure Getc is + begin + if Ptr <= Input'First then + Past_End; + else + Ptr := Ptr - 1; + U := Unsigned_32 (Character'Pos (Input (Ptr))); + end if; + end Getc; + + ------------------- + -- Skip_UTF_Byte -- + ------------------- + + procedure Skip_UTF_Byte is + begin + if (U and 2#11000000#) = 2#10_000000# then + Getc; + else + Bad; + end if; + end Skip_UTF_Byte; + + -- Start of processing for UTF-8 case + + begin + -- 16#00_0000#-16#00_007F#: 0xxxxxxx + + Getc; + + if (U and 2#10000000#) = 2#00000000# then + return; + + -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx + + else + Skip_UTF_Byte; + + if (U and 2#11100000#) = 2#110_00000# then + return; + + -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx + + else + Skip_UTF_Byte; + + if (U and 2#11110000#) = 2#1110_0000# then + return; + + -- Any other code is invalid, note that this includes: + + -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx + -- 10xxxxxx + + -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx + -- 10xxxxxx 10xxxxxx + -- 10xxxxxx + + -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx + -- 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx + + -- since Wide_Character does not allow codes > 16#FFFF# + + else + Bad; + end if; + end if; + end if; + end UTF8; + + -- Special efficient encoding for brackets case + + elsif Encoding_Method = WCEM_Brackets then + Brackets : declare + P : Natural; + S : Natural; + + begin + -- See if we have "] at end positions + + if Ptr > Input'First + 1 + and then Input (Ptr - 1) = ']' + and then Input (Ptr - 2) = '"' + then + P := Ptr - 2; + + -- Loop back looking for [" at start + + while P >= Ptr - 10 loop + if P <= Input'First + 1 then + Bad; + + elsif Input (P - 1) = '"' + and then Input (P - 2) = '[' + then + -- Found ["..."], scan forward to check it + + S := P - 2; + P := S; + Next_Wide_Character (Input, P); + + -- OK if at original pointer, else error + + if P = Ptr then + Ptr := S; + return; + else + Bad; + end if; + end if; + + P := P - 1; + end loop; + + -- Falling through loop means more than 8 chars between the + -- enclosing brackets (or simply a missing left bracket) + + Bad; + + -- Here if no bracket sequence present + + else + if Ptr = Input'First then + Past_End; + else + Ptr := Ptr - 1; + end if; + end if; + end Brackets; + + -- Non-UTF-8/Brackets. These are the inefficient cases where we have to + -- go to the start of the string and skip forwards till Ptr matches. + + else + Non_UTF_Brackets : declare + Discard : Wide_Character; + PtrS : Natural; + PtrP : Natural; + + begin + PtrS := Input'First; + + if Ptr <= PtrS then + Past_End; + end if; + + loop + PtrP := PtrS; + Decode_Wide_Character (Input, PtrS, Discard); + + if PtrS = Ptr then + Ptr := PtrP; + return; + + elsif PtrS > Ptr then + Bad; + end if; + end loop; + + exception + when Constraint_Error => + Bad; + end Non_UTF_Brackets; + end if; + end Prev_Wide_Character; + + ------------------------------ + -- Prev_Wide_Wide_Character -- + ------------------------------ + + procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is + begin + if Ptr > Input'Last + 1 then + Past_End; + end if; + + -- Special efficient encoding for UTF-8 case + + if Encoding_Method = WCEM_UTF8 then + UTF8 : declare + U : Unsigned_32; + + procedure Getc; + pragma Inline (Getc); + -- Gets the character at Input (Ptr - 1) and returns code in U as + -- Unsigned_32 value. On return Ptr is decremented by one. + + procedure Skip_UTF_Byte; + pragma Inline (Skip_UTF_Byte); + -- Checks that U is 2#10xxxxxx# and then calls Get + + ---------- + -- Getc -- + ---------- + + procedure Getc is + begin + if Ptr <= Input'First then + Past_End; + else + Ptr := Ptr - 1; + U := Unsigned_32 (Character'Pos (Input (Ptr))); + end if; + end Getc; + + ------------------- + -- Skip_UTF_Byte -- + ------------------- + + procedure Skip_UTF_Byte is + begin + if (U and 2#11000000#) = 2#10_000000# then + Getc; + else + Bad; + end if; + end Skip_UTF_Byte; + + -- Start of processing for UTF-8 case + + begin + -- 16#00_0000#-16#00_007F#: 0xxxxxxx + + Getc; + + if (U and 2#10000000#) = 2#00000000# then + return; + + -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx + + else + Skip_UTF_Byte; + + if (U and 2#11100000#) = 2#110_00000# then + return; + + -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx + + else + Skip_UTF_Byte; + + if (U and 2#11110000#) = 2#1110_0000# then + return; + + -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx + -- 10xxxxxx + + else + Skip_UTF_Byte; + + if (U and 2#11111000#) = 2#11110_000# then + return; + + -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx + -- 10xxxxxx 10xxxxxx + -- 10xxxxxx + + else + Skip_UTF_Byte; + + if (U and 2#11111100#) = 2#111110_00# then + return; + + -- Any other code is invalid, note that this includes: + + -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx + -- 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx + + -- since Wide_Wide_Character does not allow codes + -- greater than 16#03FF_FFFF# + + else + Bad; + end if; + end if; + end if; + end if; + end if; + end UTF8; + + -- Special efficient encoding for brackets case + + elsif Encoding_Method = WCEM_Brackets then + Brackets : declare + P : Natural; + S : Natural; + + begin + -- See if we have "] at end positions + + if Ptr > Input'First + 1 + and then Input (Ptr - 1) = ']' + and then Input (Ptr - 2) = '"' + then + P := Ptr - 2; + + -- Loop back looking for [" at start + + while P >= Ptr - 10 loop + if P <= Input'First + 1 then + Bad; + + elsif Input (P - 1) = '"' + and then Input (P - 2) = '[' + then + -- Found ["..."], scan forward to check it + + S := P - 2; + P := S; + Next_Wide_Wide_Character (Input, P); + + -- OK if at original pointer, else error + + if P = Ptr then + Ptr := S; + return; + else + Bad; + end if; + end if; + + P := P - 1; + end loop; + + -- Falling through loop means more than 8 chars between the + -- enclosing brackets (or simply a missing left bracket) + + Bad; + + -- Here if no bracket sequence present + + else + if Ptr = Input'First then + Past_End; + else + Ptr := Ptr - 1; + end if; + end if; + end Brackets; + + -- Non-UTF-8/Brackets. These are the inefficient cases where we have to + -- go to the start of the string and skip forwards till Ptr matches. + + else + Non_UTF8_Brackets : declare + Discard : Wide_Wide_Character; + PtrS : Natural; + PtrP : Natural; + + begin + PtrS := Input'First; + + if Ptr <= PtrS then + Past_End; + end if; + + loop + PtrP := PtrS; + Decode_Wide_Wide_Character (Input, PtrS, Discard); + + if PtrS = Ptr then + Ptr := PtrP; + return; + + elsif PtrS > Ptr then + Bad; + end if; + end loop; + + exception + when Constraint_Error => + Bad; + end Non_UTF8_Brackets; + end if; + end Prev_Wide_Wide_Character; + + -------------------------- + -- Validate_Wide_String -- + -------------------------- + + function Validate_Wide_String (S : String) return Boolean is + Ptr : Natural; + + begin + Ptr := S'First; + while Ptr <= S'Last loop + Next_Wide_Character (S, Ptr); + end loop; + + return True; + + exception + when Constraint_Error => + return False; + end Validate_Wide_String; + + ------------------------------- + -- Validate_Wide_Wide_String -- + ------------------------------- + + function Validate_Wide_Wide_String (S : String) return Boolean is + Ptr : Natural; + + begin + Ptr := S'First; + while Ptr <= S'Last loop + Next_Wide_Wide_Character (S, Ptr); + end loop; + + return True; + + exception + when Constraint_Error => + return False; + end Validate_Wide_Wide_String; + +end GNAT.Decode_String; diff --git a/gcc/ada/g-decstr.ads b/gcc/ada/g-decstr.ads new file mode 100755 index 00000000000..07d501552e0 --- /dev/null +++ b/gcc/ada/g-decstr.ads @@ -0,0 +1,163 @@ +------------------------------------------------------------------------------ +-- -- +-- GNAT RUN-TIME COMPONENTS -- +-- -- +-- G N A T . D E C O D E _ S T R I N G -- +-- -- +-- S p e c -- +-- -- +-- Copyright (C) 2007, AdaCore -- +-- -- +-- GNAT is free software; you can redistribute it and/or modify it under -- +-- terms of the GNU General Public License as published by the Free Soft- -- +-- ware Foundation; either version 2, or (at your option) any later ver- -- +-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- +-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- +-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -- +-- for more details. You should have received a copy of the GNU General -- +-- Public License distributed with GNAT; see file COPYING. If not, write -- +-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, -- +-- Boston, MA 02110-1301, USA. -- +-- -- +-- As a special exception, if other files instantiate generics from this -- +-- unit, or you link this unit with other files to produce an executable, -- +-- this unit does not by itself cause the resulting executable to be -- +-- covered by the GNU General Public License. This exception does not -- +-- however invalidate any other reasons why the executable file might be -- +-- covered by the GNU Public License. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +-- This generic package provides utility routines for converting from an +-- encoded string to a corresponding Wide_String or Wide_Wide_String value +-- using a specified encoding convention, which is supplied as the generic +-- parameter. UTF-8 is handled especially efficiently, and if the encoding +-- method is known at compile time to be WCEM_UTF8, then the instantiation +-- is specialized to handle only the UTF-8 case and exclude code for the +-- other encoding methods. The package also provides positioning routines +-- for skipping encoded characters in either direction, and for validating +-- strings for correct encodings. + +-- Note: this package is only about decoding sequences of 8-bit characters +-- into corresponding 16-bit Wide_String or 32-bit Wide_Wide_String values. +-- It knows nothing at all about the character encodings being used for the +-- resulting Wide_Character and Wide_Wide_Character values. Most often this +-- will be Unicode/ISO-10646 as specified by the Ada RM, but this package +-- does not make any assumptions about the character coding. See also the +-- packages Ada.Wide_[Wide_]Characters.Unicode for unicode specific functions. + +-- Note on the use of brackets encoding (WCEM_Brackets). The brackets encoding +-- method is ambiguous in the context of this package, since there is no way +-- to tell if ["1234"] is eight unencoded characters or one encoded character. +-- In the context of Ada sources, any sequence starting [" must be the start +-- of an encoding (since that sequence is not valid in Ada source otherwise). +-- The routines in this package use the same approach. If the input string +-- contains the sequence [" then this is assumed to be the start of a brackets +-- encoding sequence, and if it does not match the syntax, an error is raised. +-- In the case of the Prev functions, a sequence ending with "] is assumed to +-- be a valid brackets sequence, and an error is raised if it is not. + +with System.WCh_Con; + +generic + Encoding_Method : System.WCh_Con.WC_Encoding_Method; + +package GNAT.Decode_String is + pragma Pure; + + function Decode_Wide_String (S : String) return Wide_String; + pragma Inline (Decode_Wide_String); + -- Decode the given String, which is encoded using the indicated coding + -- method, returning the corresponding decoded Wide_String value. If S + -- contains a character code that cannot be represented with the given + -- encoding, then Constraint_Error is raised. + + procedure Decode_Wide_String + (S : String; + Result : out Wide_String; + Length : out Natural); + -- Similar to the above function except that the result is stored in the + -- given Wide_String variable Result, starting at Result (Result'First). On + -- return, Length is set to the number of characters stored in Result. The + -- caller must ensure that Result is long enough (an easy choice is to set + -- the length equal to the S'Length, since decoding can never increase the + -- string length). If the length of Result is insufficient Constraint_Error + -- will be raised. + + function Decode_Wide_Wide_String (S : String) return Wide_Wide_String; + pragma Inline (Decode_Wide_Wide_String); + -- Same as above function but for Wide_Wide_String output + + procedure Decode_Wide_Wide_String + (S : String; + Result : out Wide_Wide_String; + Length : out Natural); + -- Same as above procedure, but for Wide_Wide_String output + + function Validate_Wide_String (S : String) return Boolean; + -- This function inspects the string S to determine if it contains only + -- valid encodings corresponding to Wide_Character values using the + -- given encoding. If a call to Decode_Wide_String (S) would return + -- without raising Constraint_Error, then Validate_Wide_String will + -- return True. If the call would have raised Constraint_Error, then + -- Validate_Wide_String will return False. + + function Validate_Wide_Wide_String (S : String) return Boolean; + -- Similar to Validate_Wide_String, except that it succeeds if the string + -- contains only encodings corresponding to Wide_Wide_Character values. + + procedure Decode_Wide_Character + (Input : String; + Ptr : in out Natural; + Result : out Wide_Character); + pragma Inline (Decode_Wide_Character); + -- This is a lower level procedure that decodes a single character using + -- the given encoding method. The encoded character is stored in Input, + -- starting at Input (Ptr). The resulting output character is stored in + -- Result, and on return Ptr is updated past the input character or + -- encoding sequence. Constraint_Error will be raised if the input has + -- has a character that cannot be represented using the given encoding, + -- or if Ptr is outside the bounds of the Input string. + + procedure Decode_Wide_Wide_Character + (Input : String; + Ptr : in out Natural; + Result : out Wide_Wide_Character); + -- Same as above procedure but with Wide_Wide_Character input + + procedure Next_Wide_Character (Input : String; Ptr : in out Natural); + -- This procedure examines the input string starting at Input (Ptr), and + -- advances Ptr past one character in the encoded string, so that on return + -- Ptr points to the next encoded character. Constraint_Error is raised if + -- an invalid encoding is encountered, or the end of the string is reached + -- or if Ptr is less than String'First on entry, or if the character + -- skipped is not a valid Wide_Character code. This call may be more + -- efficient than calling Decode_Wide_Character and discarding the result. + + procedure Prev_Wide_Character (Input : String; Ptr : in out Natural); + -- This procedure is similar to Next_Encoded_Character except that it moves + -- backwards in the string, so that on return, Ptr is set to point to the + -- previous encoded character. Constraint_Error is raised if the start of + -- the string is encountered. It is valid for Ptr to be one past the end + -- of the string for this call (in which case on return it will point to + -- the last encoded character). + -- + -- Note: it is not generally possible to do this function efficiently with + -- all encodings, the current implementation is only efficient for the case + -- of UTF-8 (Encoding_Method = WCEM_UTF8) and Brackets (Encoding_Method = + -- WCEM_Brackets). For all other encodings, we work by starting at the + -- beginning of the string and moving forward till Ptr is reached, which + -- is correct but slow. + + procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural); + -- Similar to Next_Wide_Character except that codes skipped must be valid + -- Wide_Wide_Character codes. + + procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural); + -- Similar to Prev_Wide_Character except that codes skipped must be valid + -- Wide_Wide_Character codes. + +end GNAT.Decode_String; diff --git a/gcc/ada/g-deutst.ads b/gcc/ada/g-deutst.ads new file mode 100644 index 00000000000..ca03ace5260 --- /dev/null +++ b/gcc/ada/g-deutst.ads @@ -0,0 +1,45 @@ +------------------------------------------------------------------------------ +-- -- +-- GNAT RUN-TIME COMPONENTS -- +-- -- +-- G N A T . D E C O D E _ U T F 8 _ S T R I N G -- +-- -- +-- S p e c -- +-- -- +-- Copyright (C) 2007, AdaCore -- +-- -- +-- GNAT is free software; you can redistribute it and/or modify it under -- +-- terms of the GNU General Public License as published by the Free Soft- -- +-- ware Foundation; either version 2, or (at your option) any later ver- -- +-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- +-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- +-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -- +-- for more details. You should have received a copy of the GNU General -- +-- Public License distributed with GNAT; see file COPYING. If not, write -- +-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, -- +-- Boston, MA 02110-1301, USA. -- +-- -- +-- As a special exception, if other files instantiate generics from this -- +-- unit, or you link this unit with other files to produce an executable, -- +-- this unit does not by itself cause the resulting executable to be -- +-- covered by the GNU General Public License. This exception does not -- +-- however invalidate any other reasons why the executable file might be -- +-- covered by the GNU Public License. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +-- This package provides a pre-instantiation of GNAT.Decode_String for the +-- common case of UTF-8 encoding. As noted in the documentation of that +-- package, this UTF-8 instantiation is efficient and specialized so that +-- it has only the code for the UTF-8 case. See g-decstr.ads for full +-- documentation on this package. + +with GNAT.Decode_String; + +with System.WCh_Con; + +package GNAT.Decode_UTF8_String is + new GNAT.Decode_String (System.WCh_Con.WCEM_UTF8); diff --git a/gcc/ada/g-encstr.adb b/gcc/ada/g-encstr.adb new file mode 100755 index 00000000000..6f1411693fe --- /dev/null +++ b/gcc/ada/g-encstr.adb @@ -0,0 +1,260 @@ +------------------------------------------------------------------------------ +-- -- +-- GNAT RUN-TIME COMPONENTS -- +-- -- +-- G N A T . E N C O D E _ S T R I N G -- +-- -- +-- B o d y -- +-- -- +-- Copyright (C) 2007, AdaCore -- +-- -- +-- GNAT is free software; you can redistribute it and/or modify it under -- +-- terms of the GNU General Public License as published by the Free Soft- -- +-- ware Foundation; either version 2, or (at your option) any later ver- -- +-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- +-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- +-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -- +-- for more details. You should have received a copy of the GNU General -- +-- Public License distributed with GNAT; see file COPYING. If not, write -- +-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, -- +-- Boston, MA 02110-1301, USA. -- +-- -- +-- As a special exception, if other files instantiate generics from this -- +-- unit, or you link this unit with other files to produce an executable, -- +-- this unit does not by itself cause the resulting executable to be -- +-- covered by the GNU General Public License. This exception does not -- +-- however invalidate any other reasons why the executable file might be -- +-- covered by the GNU Public License. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +with Interfaces; use Interfaces; + +with System.WCh_Con; use System.WCh_Con; +with System.WCh_Cnv; use System.WCh_Cnv; + +package body GNAT.Encode_String is + + ----------------------- + -- Local Subprograms -- + ----------------------- + + procedure Bad; + pragma No_Return (Bad); + -- Raise error for bad character code + + procedure Past_End; + pragma No_Return (Past_End); + -- Raise error for off end of string + + --------- + -- Bad -- + --------- + + procedure Bad is + begin + raise Constraint_Error with + "character cannot be encoded with given Encoding_Method"; + end Bad; + + ------------------------ + -- Encode_Wide_String -- + ------------------------ + + function Encode_Wide_String (S : Wide_String) return String is + Long : constant Natural := WC_Longest_Sequences (Encoding_Method); + Result : String (1 .. S'Length * Long); + Length : Natural; + begin + Encode_Wide_String (S, Result, Length); + return Result (1 .. Length); + end Encode_Wide_String; + + procedure Encode_Wide_String + (S : Wide_String; + Result : out String; + Length : out Natural) + is + Ptr : Natural; + + begin + Ptr := S'First; + for J in S'Range loop + Encode_Wide_Character (S (J), Result, Ptr); + end loop; + + Length := Ptr - S'First; + end Encode_Wide_String; + + ----------------------------- + -- Encode_Wide_Wide_String -- + ----------------------------- + + function Encode_Wide_Wide_String (S : Wide_Wide_String) return String is + Long : constant Natural := WC_Longest_Sequences (Encoding_Method); + Result : String (1 .. S'Length * Long); + Length : Natural; + begin + Encode_Wide_Wide_String (S, Result, Length); + return Result (1 .. Length); + end Encode_Wide_Wide_String; + + procedure Encode_Wide_Wide_String + (S : Wide_Wide_String; + Result : out String; + Length : out Natural) + is + Ptr : Natural; + + begin + Ptr := S'First; + for J in S'Range loop + Encode_Wide_Wide_Character (S (J), Result, Ptr); + end loop; + + Length := Ptr - S'First; + end Encode_Wide_Wide_String; + + --------------------------- + -- Encode_Wide_Character -- + --------------------------- + + procedure Encode_Wide_Character + (Char : Wide_Character; + Result : in out String; + Ptr : in out Natural) + is + begin + Encode_Wide_Wide_Character + (Wide_Wide_Character'Val (Wide_Character'Pos (Char)), Result, Ptr); + + exception + when Constraint_Error => + Bad; + end Encode_Wide_Character; + + -------------------------------- + -- Encode_Wide_Wide_Character -- + -------------------------------- + + procedure Encode_Wide_Wide_Character + (Char : Wide_Wide_Character; + Result : in out String; + Ptr : in out Natural) + is + U : Unsigned_32; + + procedure Out_Char (C : Character); + pragma Inline (Out_Char); + -- Procedure to store one character for instantiation below + + -------------- + -- Out_Char -- + -------------- + + procedure Out_Char (C : Character) is + begin + if Ptr > Result'Last then + Past_End; + else + Result (Ptr) := C; + Ptr := Ptr + 1; + end if; + end Out_Char; + + -- Start of processing for Encode_Wide_Wide_Character; + + begin + -- Efficient code for UTF-8 case + + if Encoding_Method = WCEM_UTF8 then + + -- Note: for details of UTF8 encoding see RFC 3629 + + U := Unsigned_32 (Wide_Wide_Character'Pos (Char)); + + -- 16#00_0000#-16#00_007F#: 0xxxxxxx + + if U <= 16#00_007F# then + Out_Char (Character'Val (U)); + + -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx + + elsif U <= 16#00_07FF# then + Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6))); + Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); + + -- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx + + elsif U <= 16#00_FFFF# then + Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12))); + Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) + and 2#00111111#))); + Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); + + -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + + elsif U <= 16#10_FFFF# then + Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18))); + Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12) + and 2#00111111#))); + Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) + and 2#00111111#))); + Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); + + -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx + + elsif U <= 16#03FF_FFFF# then + Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24))); + Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18) + and 2#00111111#))); + Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12) + and 2#00111111#))); + Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) + and 2#00111111#))); + Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); + + -- All other cases are invalid character codes, not this includes: + + -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx + -- 10xxxxxx 10xxxxxx 10xxxxxx + + -- since Wide_Wide_Character values cannot exceed 16#3F_FFFF# + + else + Bad; + end if; + + -- All encoding methods other than UTF-8 + + else + Non_UTF8 : declare + procedure UTF_32_To_String is + new UTF_32_To_Char_Sequence (Out_Char); + -- Instantiate conversion procedure with above Out_Char routine + + begin + UTF_32_To_String + (UTF_32_Code (Wide_Wide_Character'Pos (Char)), Encoding_Method); + + exception + when Constraint_Error => + Bad; + end Non_UTF8; + end if; + end Encode_Wide_Wide_Character; + + -------------- + -- Past_End -- + -------------- + + procedure Past_End is + begin + raise Constraint_Error with "past end of string"; + end Past_End; + +end GNAT.Encode_String; diff --git a/gcc/ada/g-encstr.ads b/gcc/ada/g-encstr.ads new file mode 100755 index 00000000000..5862bbe182d --- /dev/null +++ b/gcc/ada/g-encstr.ads @@ -0,0 +1,111 @@ +------------------------------------------------------------------------------ +-- -- +-- GNAT RUN-TIME COMPONENTS -- +-- -- +-- G N A T . E N C O D E _ S T R I N G -- +-- -- +-- S p e c -- +-- -- +-- Copyright (C) 2007, AdaCore -- +-- -- +-- GNAT is free software; you can redistribute it and/or modify it under -- +-- terms of the GNU General Public License as published by the Free Soft- -- +-- ware Foundation; either version 2, or (at your option) any later ver- -- +-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- +-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- +-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -- +-- for more details. You should have received a copy of the GNU General -- +-- Public License distributed with GNAT; see file COPYING. If not, write -- +-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, -- +-- Boston, MA 02110-1301, USA. -- +-- -- +-- As a special exception, if other files instantiate generics from this -- +-- unit, or you link this unit with other files to produce an executable, -- +-- this unit does not by itself cause the resulting executable to be -- +-- covered by the GNU General Public License. This exception does not -- +-- however invalidate any other reasons why the executable file might be -- +-- covered by the GNU Public License. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +-- This generic package provides utility routines for converting from +-- Wide_String or Wide_Wide_String to encoded String using a specified +-- encoding convention, which is supplied as the generic parameter. If +-- this parameter is a known at compile time constant (e.g. a constant +-- definned in System.WCh_Con), the instantiation is specialized so that +-- it applies only to this specified coding. + +-- Note: this package is only about encoding sequences of 16- or 32-bit +-- characters into a sequence of 8-bit codes. It knows nothing at all about +-- the character encodings being used for the input Wide_Character and +-- Wide_Wide_Character values, although some of the encoding methods (notably +-- JIS and EUC) have built in assumptions about the range of possible input +-- code values. Most often the input will be Unicode/ISO-10646 as specified by +-- the Ada RM, but this package does not make any assumptions about the +-- character coding, and in the case of UTF-8 all possible code values can be +-- encoded. See also the packages Ada.Wide_[Wide_]Characters.Unicode for +-- unicode specific functions. + +-- Note on brackets encoding (WCEM_Brackets). On input, upper half characters +-- can be represented as ["hh"] but the routines in this package will only use +-- brackets encodings for codes higher than 16#FF#, so upper half characters +-- will be output as single Character values. + +with System.WCh_Con; + +generic + Encoding_Method : System.WCh_Con.WC_Encoding_Method; + +package GNAT.Encode_String is + pragma Pure; + + function Encode_Wide_String (S : Wide_String) return String; + pragma Inline (Encode_Wide_String); + -- Encode the given Wide_String, returning a String encoded using the + -- given encoding method. Constraint_Error will be raised if the encoding + -- method cannot accomodate the input data. + + procedure Encode_Wide_String + (S : Wide_String; + Result : out String; + Length : out Natural); + -- Encode the given Wide_String, storing the encoded string in Result, + -- with Length being set to the length of the encoded string. The caller + -- must ensure that Result is long enough (see useful constants defined + -- in System.WCh_Con: WC_Longest_Sequence, WC_Longest_Sequences). If the + -- length of Result is insufficient Constraint_Error will be raised. + -- Constraint_Error will also be raised if the encoding method cannot + -- accomodate the input data. + + function Encode_Wide_Wide_String (S : Wide_Wide_String) return String; + pragma Inline (Encode_Wide_Wide_String); + -- Same as above function but for Wide_Wide_String input + + procedure Encode_Wide_Wide_String + (S : Wide_Wide_String; + Result : out String; + Length : out Natural); + -- Same as above procedure, but for Wide_Wide_String input + + procedure Encode_Wide_Character + (Char : Wide_Character; + Result : in out String; + Ptr : in out Natural); + pragma Inline (Encode_Wide_Character); + -- This is a lower level procedure that encodes the single character Char. + -- The output is stored in Result starting at Result (Ptr), and Ptr is + -- updated past the stored value. Constraint_Error is raised if Result + -- is not long enough to accomodate the result, or if the encoding method + -- specified does not accomodate the input character value, or if Ptr is + -- outside the bounds of the Result string. + + procedure Encode_Wide_Wide_Character + (Char : Wide_Wide_Character; + Result : in out String; + Ptr : in out Natural); + -- Same as above procedure but with Wide_Wide_Character input + +end GNAT.Encode_String; diff --git a/gcc/ada/g-enutst.ads b/gcc/ada/g-enutst.ads new file mode 100644 index 00000000000..3c4632866bf --- /dev/null +++ b/gcc/ada/g-enutst.ads @@ -0,0 +1,45 @@ +------------------------------------------------------------------------------ +-- -- +-- GNAT RUN-TIME COMPONENTS -- +-- -- +-- G N A T . E N C O D E _ U T F 8 _ S T R I N G -- +-- -- +-- S p e c -- +-- -- +-- Copyright (C) 2007, AdaCore -- +-- -- +-- GNAT is free software; you can redistribute it and/or modify it under -- +-- terms of the GNU General Public License as published by the Free Soft- -- +-- ware Foundation; either version 2, or (at your option) any later ver- -- +-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- +-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- +-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -- +-- for more details. You should have received a copy of the GNU General -- +-- Public License distributed with GNAT; see file COPYING. If not, write -- +-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, -- +-- Boston, MA 02110-1301, USA. -- +-- -- +-- As a special exception, if other files instantiate generics from this -- +-- unit, or you link this unit with other files to produce an executable, -- +-- this unit does not by itself cause the resulting executable to be -- +-- covered by the GNU General Public License. This exception does not -- +-- however invalidate any other reasons why the executable file might be -- +-- covered by the GNU Public License. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +-- This package provides a pre-instantiation of GNAT.Encode_String for the +-- common case of UTF-8 encoding. As noted in the documentation of that +-- package, this UTF-8 instantiation is efficient and specialized so that +-- it has only the code for the UTF-8 case. See g-encstr.ads for full +-- documentation on this package. + +with GNAT.Encode_String; + +with System.WCh_Con; + +package GNAT.Encode_UTF8_String is + new GNAT.Encode_String (System.WCh_Con.WCEM_UTF8); diff --git a/gcc/ada/scn.adb b/gcc/ada/scn.adb index f970d84ab98..d4e7bd387d9 100644 --- a/gcc/ada/scn.adb +++ b/gcc/ada/scn.adb @@ -28,6 +28,7 @@ with Csets; use Csets; with Hostparm; use Hostparm; with Namet; use Namet; with Opt; use Opt; +with Output; use Output; with Restrict; use Restrict; with Rident; use Rident; with Scans; use Scans; @@ -35,6 +36,10 @@ with Sinfo; use Sinfo; with Sinput; use Sinput; with Uintp; use Uintp; +with GNAT.Byte_Order_Mark; use GNAT.Byte_Order_Mark; + +with System.WCh_Con; use System.WCh_Con; + package body Scn is use ASCII; @@ -266,6 +271,42 @@ package body Scn is Set_License (Current_Source_File, Determine_License); end if; + -- Check for BOM + + declare + BOM : BOM_Kind; + Len : Natural; + Tst : String (1 .. 5); + + begin + for J in 1 .. 5 loop + Tst (J) := Source (Scan_Ptr + Source_Ptr (J) - 1); + end loop; + + Read_BOM (Tst, Len, BOM, False); + + case BOM is + when UTF8_All => + Scan_Ptr := Scan_Ptr + Source_Ptr (Len); + Wide_Character_Encoding_Method := WCEM_UTF8; + Upper_Half_Encoding := True; + + when UTF16_LE | UTF16_BE => + Write_Line ("UTF-16 encoding format not recognized"); + raise Unrecoverable_Error; + + when UTF32_LE | UTF32_BE => + Write_Line ("UTF-32 encoding format not recognized"); + raise Unrecoverable_Error; + + when Unknown => + null; + + when others => + raise Program_Error; + end case; + end; + -- Because of the License stuff above, Scng.Initialize_Scanner cannot -- call Scan. Scan initial token (note this initializes Prev_Token, -- Prev_Token_Ptr). |