summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorcharlet <charlet@138bc75d-0d04-0410-961f-82ee72b054a4>2007-12-13 10:29:24 +0000
committercharlet <charlet@138bc75d-0d04-0410-961f-82ee72b054a4>2007-12-13 10:29:24 +0000
commitc18655b9ce902c81ab97367927cf579421e6a6fb (patch)
tree43bd0ad62150abfc2319fc29f19db59d9db58ebc
parentc2be248db6c5dd014b6210e4b711260ca5fbd172 (diff)
downloadppe42-gcc-c18655b9ce902c81ab97367927cf579421e6a6fb.tar.gz
ppe42-gcc-c18655b9ce902c81ab97367927cf579421e6a6fb.zip
2007-12-06 Robert Dewar <dewar@adacore.com>
* g-byorma.adb, g-byorma.ads, g-decstr.adb, g-decstr.ads, g-deutst.ads, g-encstr.adb, g-encstr.ads, g-enutst.ads: New files. * scn.adb: Implement BOM recognition git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@130849 138bc75d-0d04-0410-961f-82ee72b054a4
-rwxr-xr-xgcc/ada/g-byorma.adb191
-rwxr-xr-xgcc/ada/g-byorma.ads104
-rwxr-xr-xgcc/ada/g-decstr.adb972
-rwxr-xr-xgcc/ada/g-decstr.ads163
-rw-r--r--gcc/ada/g-deutst.ads45
-rwxr-xr-xgcc/ada/g-encstr.adb260
-rwxr-xr-xgcc/ada/g-encstr.ads111
-rw-r--r--gcc/ada/g-enutst.ads45
-rw-r--r--gcc/ada/scn.adb41
9 files changed, 1932 insertions, 0 deletions
diff --git a/gcc/ada/g-byorma.adb b/gcc/ada/g-byorma.adb
new file mode 100755
index 00000000000..9cc6f08b519
--- /dev/null
+++ b/gcc/ada/g-byorma.adb
@@ -0,0 +1,191 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT COMPILER COMPONENTS --
+-- --
+-- G N A T . B Y T E _ O R D E R _ M A R K --
+-- --
+-- B o d y --
+-- --
+-- Copyright (C) 2006-2007, AdaCore --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 2, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
+-- for more details. You should have received a copy of the GNU General --
+-- Public License distributed with GNAT; see file COPYING. If not, write --
+-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
+-- Boston, MA 02110-1301, USA. --
+-- --
+-- As a special exception, if other files instantiate generics from this --
+-- unit, or you link this unit with other files to produce an executable, --
+-- this unit does not by itself cause the resulting executable to be --
+-- covered by the GNU General Public License. This exception does not --
+-- however invalidate any other reasons why the executable file might be --
+-- covered by the GNU Public License. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+------------------------------------------------------------------------------
+
+package body GNAT.Byte_Order_Mark is
+
+ --------------
+ -- Read_BOM --
+ --------------
+
+ procedure Read_BOM
+ (Str : String;
+ Len : out Natural;
+ BOM : out BOM_Kind;
+ XML_Support : Boolean := False)
+ is
+ begin
+ -- UTF-16 (big-endian)
+
+ if Str'Length >= 2
+ and then Str (Str'First) = Character'Val (16#FE#)
+ and then Str (Str'First + 1) = Character'Val (16#FF#)
+ then
+ Len := 2;
+ BOM := UTF16_BE;
+
+ -- UTF-16 (little-endian)
+
+ elsif Str'Length >= 2
+ and then Str (Str'First) = Character'Val (16#FF#)
+ and then Str (Str'First + 1) = Character'Val (16#FE#)
+ then
+ Len := 2;
+ BOM := UTF16_LE;
+
+ -- UTF-32 (big-endian)
+
+ elsif Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#00#)
+ and then Str (Str'First + 1) = Character'Val (16#00#)
+ and then Str (Str'First + 2) = Character'Val (16#FE#)
+ and then Str (Str'First + 3) = Character'Val (16#FF#)
+ then
+ Len := 4;
+ BOM := UTF32_BE;
+
+ -- UTF-32 (little-endian)
+
+ elsif Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#FF#)
+ and then Str (Str'First + 1) = Character'Val (16#FE#)
+ and then Str (Str'First + 2) = Character'Val (16#00#)
+ and then Str (Str'First + 3) = Character'Val (16#00#)
+ then
+ Len := 4;
+ BOM := UTF32_LE;
+
+ -- UTF-8 (endian-independent)
+
+ elsif Str'Length >= 3
+ and then Str (Str'First) = Character'Val (16#EF#)
+ and then Str (Str'First + 1) = Character'Val (16#BB#)
+ and then Str (Str'First + 2) = Character'Val (16#BF#)
+ then
+ Len := 3;
+ BOM := UTF8_All;
+
+ -- UCS-4 (big-endian) XML only
+
+ elsif XML_Support
+ and then Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#00#)
+ and then Str (Str'First + 1) = Character'Val (16#00#)
+ and then Str (Str'First + 2) = Character'Val (16#00#)
+ and then Str (Str'First + 3) = Character'Val (16#3C#)
+ then
+ Len := 0;
+ BOM := UCS4_BE;
+
+ -- UCS-4 (little-endian) XML case
+
+ elsif XML_Support
+ and then Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#3C#)
+ and then Str (Str'First + 1) = Character'Val (16#00#)
+ and then Str (Str'First + 2) = Character'Val (16#00#)
+ and then Str (Str'First + 3) = Character'Val (16#00#)
+ then
+ Len := 0;
+ BOM := UCS4_LE;
+
+ -- UCS-4 (unusual byte order 2143) XML case
+
+ elsif XML_Support
+ and then Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#00#)
+ and then Str (Str'First + 1) = Character'Val (16#00#)
+ and then Str (Str'First + 2) = Character'Val (16#3C#)
+ and then Str (Str'First + 3) = Character'Val (16#00#)
+ then
+ Len := 0;
+ BOM := UCS4_2143;
+
+ -- UCS-4 (unusual byte order 3412) XML case
+
+ elsif XML_Support
+ and then Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#00#)
+ and then Str (Str'First + 1) = Character'Val (16#3C#)
+ and then Str (Str'First + 2) = Character'Val (16#00#)
+ and then Str (Str'First + 3) = Character'Val (16#00#)
+ then
+ Len := 0;
+ BOM := UCS4_3412;
+
+ -- UTF-16 (big-endian) XML case
+
+ elsif XML_Support
+ and then Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#00#)
+ and then Str (Str'First + 1) = Character'Val (16#3C#)
+ and then Str (Str'First + 2) = Character'Val (16#00#)
+ and then Str (Str'First + 3) = Character'Val (16#3F#)
+ then
+ Len := 0;
+ BOM := UTF16_BE;
+
+ -- UTF-32 (little-endian) XML case
+
+ elsif XML_Support
+ and then Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#3C#)
+ and then Str (Str'First + 1) = Character'Val (16#00#)
+ and then Str (Str'First + 2) = Character'Val (16#3F#)
+ and then Str (Str'First + 3) = Character'Val (16#00#)
+ then
+ Len := 0;
+ BOM := UTF16_LE;
+
+ -- Unrecognized special encodings XML only
+
+ elsif XML_Support
+ and then Str'Length >= 4
+ and then Str (Str'First) = Character'Val (16#3C#)
+ and then Str (Str'First + 1) = Character'Val (16#3F#)
+ and then Str (Str'First + 2) = Character'Val (16#78#)
+ and then Str (Str'First + 3) = Character'Val (16#6D#)
+ then
+ -- Utf8, ASCII, some part of ISO8859, Shift-JIS, EUC,...
+
+ Len := 0;
+ BOM := Unknown;
+
+ -- No BOM recognized
+
+ else
+ Len := 0;
+ BOM := Unknown;
+ end if;
+ end Read_BOM;
+
+end GNAT.Byte_Order_Mark;
diff --git a/gcc/ada/g-byorma.ads b/gcc/ada/g-byorma.ads
new file mode 100755
index 00000000000..6016f755bcc
--- /dev/null
+++ b/gcc/ada/g-byorma.ads
@@ -0,0 +1,104 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT COMPILER COMPONENTS --
+-- --
+-- G N A T . B Y T E _ O R D E R _ M A R K --
+-- --
+-- S p e c --
+-- --
+-- Copyright (C) 2006-2007, AdaCore --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 2, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
+-- for more details. You should have received a copy of the GNU General --
+-- Public License distributed with GNAT; see file COPYING. If not, write --
+-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
+-- Boston, MA 02110-1301, USA. --
+-- --
+-- As a special exception, if other files instantiate generics from this --
+-- unit, or you link this unit with other files to produce an executable, --
+-- this unit does not by itself cause the resulting executable to be --
+-- covered by the GNU General Public License. This exception does not --
+-- however invalidate any other reasons why the executable file might be --
+-- covered by the GNU Public License. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+------------------------------------------------------------------------------
+
+-- This package provides a procedure for reading and interpreting the BOM
+-- (byte order mark) used to publish the encoding method for a string (for
+-- example, a UTF-8 encoded file in windows will start with the appropriate
+-- BOM sequence to signal UTF-8 encoding.
+
+-- There are two cases
+
+-- Case 1. UTF encodings for Unicode files
+
+-- Here the convention is to have the first character of the file be a
+-- non-breaking zero width space character (16#0000_FEFF#). For the UTF
+-- encodings, the representation of this character can be used to uniquely
+-- determine the encoding. Furthermore, the possibility of any confusion
+-- with unencoded files is minimal, since for example the UTF-8 encoding
+-- of this character looks like the sequence:
+
+-- LC_I_Diaeresis
+-- Right_Angle_Quotation
+-- Fraction_One_Half
+
+-- which is so unlikely to occur legitimately in normal use that it can
+-- safely be ignored in most cases (for example, no legitimate Ada source
+-- file could start with this sequence of characters).
+
+-- Case 2. Specialized XML encodings
+
+-- The XML standard defines a number of other possible encodings and also
+-- defines standardized sequences for marking these encodings. This package
+-- can also optionally handle these XML defined BOM sequences. These XML
+-- cases depend on the first character of the XML file being < so that the
+-- encoding of this character can be recognized.
+
+pragma Warnings (Off);
+pragma Compiler_Unit;
+pragma Warnings (On);
+
+package GNAT.Byte_Order_Mark is
+
+ type BOM_Kind is
+ (UTF8_All, -- UTF8-encoding
+ UTF16_LE, -- UTF16 little-endian encoding
+ UTF16_BE, -- UTF16 big-endian encoding
+ UTF32_LE, -- UTF32 little-endian encoding
+ UTF32_BE, -- UTF32 big-endian encoding
+
+ -- The following cases are for XML only
+
+ UCS4_BE, -- UCS-4, big endian machine (1234 order)
+ UCS4_LE, -- UCS-4, little endian machine (4321 order)
+ UCS4_2143, -- UCS-4, unusual byte order (2143 order)
+ UCS4_3412, -- UCS-4, unusual byte order (3412 order)
+
+ -- Value returned if no BOM recognized
+
+ Unknown); -- Unknown, assumed to be ASCII compatible
+
+ procedure Read_BOM
+ (Str : String;
+ Len : out Natural;
+ BOM : out BOM_Kind;
+ XML_Support : Boolean := False);
+ -- This is the routine to read the BOM from the start of the given string
+ -- Str. On return BOM is set to the appropriate BOM_Kind and Len is set to
+ -- its length. The caller will typically skip the first Len characters in
+ -- the string to ignore the BOM sequence. The special XML possibilities are
+ -- recognized only if flag XML_Support is set to True. Note that for the
+ -- XML cases, Len is always set to zero on return (not to the length of the
+ -- relevant sequence) since in the XML cases, the sequence recognized is
+ -- for the first real character in the file (<) which is not to be skipped.
+
+end GNAT.Byte_Order_Mark;
diff --git a/gcc/ada/g-decstr.adb b/gcc/ada/g-decstr.adb
new file mode 100755
index 00000000000..580ad15a4a6
--- /dev/null
+++ b/gcc/ada/g-decstr.adb
@@ -0,0 +1,972 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT RUN-TIME COMPONENTS --
+-- --
+-- G N A T . D E C O D E _ S T R I N G --
+-- --
+-- S p e c --
+-- --
+-- Copyright (C) 2007, AdaCore --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 2, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
+-- for more details. You should have received a copy of the GNU General --
+-- Public License distributed with GNAT; see file COPYING. If not, write --
+-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
+-- Boston, MA 02110-1301, USA. --
+-- --
+-- As a special exception, if other files instantiate generics from this --
+-- unit, or you link this unit with other files to produce an executable, --
+-- this unit does not by itself cause the resulting executable to be --
+-- covered by the GNU General Public License. This exception does not --
+-- however invalidate any other reasons why the executable file might be --
+-- covered by the GNU Public License. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+------------------------------------------------------------------------------
+
+-- This package provides a utility routine for converting from an encoded
+-- string to a corresponding Wide_String or Wide_Wide_String value.
+
+with Interfaces; use Interfaces;
+
+with System.WCh_Cnv; use System.WCh_Cnv;
+with System.WCh_Con; use System.WCh_Con;
+
+package body GNAT.Decode_String is
+
+ -----------------------
+ -- Local Subprograms --
+ -----------------------
+
+ procedure Bad;
+ pragma No_Return (Bad);
+ -- Raise error for bad encoding
+
+ procedure Past_End;
+ pragma No_Return (Past_End);
+ -- Raise error for off end of string
+
+ ---------
+ -- Bad --
+ ---------
+
+ procedure Bad is
+ begin
+ raise Constraint_Error with
+ "bad encoding or character out of range";
+ end Bad;
+
+ ---------------------------
+ -- Decode_Wide_Character --
+ ---------------------------
+
+ procedure Decode_Wide_Character
+ (Input : String;
+ Ptr : in out Natural;
+ Result : out Wide_Character)
+ is
+ Char : Wide_Wide_Character;
+ begin
+ Decode_Wide_Wide_Character (Input, Ptr, Char);
+
+ if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
+ Bad;
+ else
+ Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
+ end if;
+ end Decode_Wide_Character;
+
+ ------------------------
+ -- Decode_Wide_String --
+ ------------------------
+
+ function Decode_Wide_String (S : String) return Wide_String is
+ Result : Wide_String (1 .. S'Length);
+ Length : Natural;
+ begin
+ Decode_Wide_String (S, Result, Length);
+ return Result (1 .. Length);
+ end Decode_Wide_String;
+
+ procedure Decode_Wide_String
+ (S : String;
+ Result : out Wide_String;
+ Length : out Natural)
+ is
+ Ptr : Natural;
+
+ begin
+ Ptr := S'First;
+ Length := 0;
+ while Ptr <= S'Last loop
+ if Length >= Result'Last then
+ Past_End;
+ end if;
+
+ Length := Length + 1;
+ Decode_Wide_Character (S, Ptr, Result (Length));
+ end loop;
+ end Decode_Wide_String;
+
+ --------------------------------
+ -- Decode_Wide_Wide_Character --
+ --------------------------------
+
+ procedure Decode_Wide_Wide_Character
+ (Input : String;
+ Ptr : in out Natural;
+ Result : out Wide_Wide_Character)
+ is
+ C : Character;
+
+ function In_Char return Character;
+ pragma Inline (In_Char);
+ -- Function to get one input character
+
+ -------------
+ -- In_Char --
+ -------------
+
+ function In_Char return Character is
+ begin
+ if Ptr <= Input'Last then
+ Ptr := Ptr + 1;
+ return Input (Ptr - 1);
+ else
+ Past_End;
+ end if;
+ end In_Char;
+
+ -- Start of processing for Decode_Wide_Wide_Character
+
+ begin
+ C := In_Char;
+
+ -- Special fast processing for UTF-8 case
+
+ if Encoding_Method = WCEM_UTF8 then
+ UTF8 : declare
+ U : Unsigned_32;
+ W : Unsigned_32;
+
+ procedure Get_UTF_Byte;
+ pragma Inline (Get_UTF_Byte);
+ -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
+ -- Reads a byte, and raises CE if the first two bits are not 10.
+ -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
+
+ ------------------
+ -- Get_UTF_Byte --
+ ------------------
+
+ procedure Get_UTF_Byte is
+ begin
+ U := Unsigned_32 (Character'Pos (In_Char));
+
+ if (U and 2#11000000#) /= 2#10_000000# then
+ Bad;
+ end if;
+
+ W := Shift_Left (W, 6) or (U and 2#00111111#);
+ end Get_UTF_Byte;
+
+ -- Start of processing for UTF8 case
+
+ begin
+ -- Note: for details of UTF8 encoding see RFC 3629
+
+ U := Unsigned_32 (Character'Pos (C));
+
+ -- 16#00_0000#-16#00_007F#: 0xxxxxxx
+
+ if (U and 2#10000000#) = 2#00000000# then
+ Result := Wide_Wide_Character'Val (Character'Pos (C));
+
+ -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+ elsif (U and 2#11100000#) = 2#110_00000# then
+ W := U and 2#00011111#;
+ Get_UTF_Byte;
+ Result := Wide_Wide_Character'Val (W);
+
+ -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+ elsif (U and 2#11110000#) = 2#1110_0000# then
+ W := U and 2#00001111#;
+ Get_UTF_Byte;
+ Get_UTF_Byte;
+ Result := Wide_Wide_Character'Val (W);
+
+ -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+ elsif (U and 2#11111000#) = 2#11110_000# then
+ W := U and 2#00000111#;
+
+ for K in 1 .. 3 loop
+ Get_UTF_Byte;
+ end loop;
+
+ Result := Wide_Wide_Character'Val (W);
+
+ -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+
+ elsif (U and 2#11111100#) = 2#111110_00# then
+ W := U and 2#00000011#;
+
+ for K in 1 .. 4 loop
+ Get_UTF_Byte;
+ end loop;
+
+ Result := Wide_Wide_Character'Val (W);
+
+ -- All other cases are invalid, note that this includes:
+
+ -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx 10xxxxxx
+
+ -- since Wide_Wide_Character does not include code values
+ -- greater than 16#03FF_FFFF#.
+
+ else
+ Bad;
+ end if;
+ end UTF8;
+
+ -- All encoding functions other than UTF-8
+
+ else
+ Non_UTF8 : declare
+ function Char_Sequence_To_UTF is
+ new Char_Sequence_To_UTF_32 (In_Char);
+
+ begin
+ -- For brackets, must test for specific case of [ not followed by
+ -- quotation, where we must not call Char_Sequence_To_UTF, but
+ -- instead just return the bracket unchanged.
+
+ if Encoding_Method = WCEM_Brackets
+ and then C = '['
+ and then (Ptr > Input'Last or else Input (Ptr) /= '"')
+ then
+ Result := '[';
+
+ -- All other cases including [" with Brackets
+
+ else
+ Result :=
+ Wide_Wide_Character'Val
+ (Char_Sequence_To_UTF (C, Encoding_Method));
+ end if;
+ end Non_UTF8;
+ end if;
+ end Decode_Wide_Wide_Character;
+
+ -----------------------------
+ -- Decode_Wide_Wide_String --
+ -----------------------------
+
+ function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
+ Result : Wide_Wide_String (1 .. S'Length);
+ Length : Natural;
+ begin
+ Decode_Wide_Wide_String (S, Result, Length);
+ return Result (1 .. Length);
+ end Decode_Wide_Wide_String;
+
+ procedure Decode_Wide_Wide_String
+ (S : String;
+ Result : out Wide_Wide_String;
+ Length : out Natural)
+ is
+ Ptr : Natural;
+
+ begin
+ Ptr := S'First;
+ Length := 0;
+ while Ptr <= S'Last loop
+ if Length >= Result'Last then
+ Past_End;
+ end if;
+
+ Length := Length + 1;
+ Decode_Wide_Wide_Character (S, Ptr, Result (Length));
+ end loop;
+ end Decode_Wide_Wide_String;
+
+ -------------------------
+ -- Next_Wide_Character --
+ -------------------------
+
+ procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
+ begin
+ if Ptr < Input'First then
+ Past_End;
+ end if;
+
+ -- Special efficient encoding for UTF-8 case
+
+ if Encoding_Method = WCEM_UTF8 then
+ UTF8 : declare
+ U : Unsigned_32;
+
+ procedure Getc;
+ pragma Inline (Getc);
+ -- Gets the character at Input (Ptr) and returns code in U as
+ -- Unsigned_32 value. On return Ptr is bumped past the character.
+
+ procedure Skip_UTF_Byte;
+ pragma Inline (Skip_UTF_Byte);
+ -- Skips past one encoded byte which must be 2#10xxxxxx#
+
+ ----------
+ -- Getc --
+ ----------
+
+ procedure Getc is
+ begin
+ if Ptr > Input'Last then
+ Past_End;
+ else
+ U := Unsigned_32 (Character'Pos (Input (Ptr)));
+ Ptr := Ptr + 1;
+ end if;
+ end Getc;
+
+ -------------------
+ -- Skip_UTF_Byte --
+ -------------------
+
+ procedure Skip_UTF_Byte is
+ begin
+ Getc;
+
+ if (U and 2#11000000#) /= 2#10_000000# then
+ Bad;
+ end if;
+ end Skip_UTF_Byte;
+
+ -- Start of processing for UTF-8 case
+
+ begin
+ -- 16#00_0000#-16#00_007F#: 0xxxxxxx
+
+ Getc;
+
+ if (U and 2#10000000#) = 2#00000000# then
+ return;
+
+ -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+ elsif (U and 2#11100000#) = 2#110_00000# then
+ Skip_UTF_Byte;
+
+ -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+ elsif (U and 2#11110000#) = 2#1110_0000# then
+ Skip_UTF_Byte;
+ Skip_UTF_Byte;
+
+ -- Any other code is invalid, note that this includes:
+
+ -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+ -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+
+ -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx 10xxxxxx
+
+ -- since Wide_Character does not allow codes > 16#FFFF#
+
+ else
+ Bad;
+ end if;
+ end UTF8;
+
+ -- Non-UTF-8 cass
+
+ else
+ declare
+ Discard : Wide_Character;
+ begin
+ Decode_Wide_Character (Input, Ptr, Discard);
+ end;
+ end if;
+ end Next_Wide_Character;
+
+ ------------------------------
+ -- Next_Wide_Wide_Character --
+ ------------------------------
+
+ procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
+ begin
+ -- Special efficient encoding for UTF-8 case
+
+ if Encoding_Method = WCEM_UTF8 then
+ UTF8 : declare
+ U : Unsigned_32;
+
+ procedure Getc;
+ pragma Inline (Getc);
+ -- Gets the character at Input (Ptr) and returns code in U as
+ -- Unsigned_32 value. On return Ptr is bumped past the character.
+
+ procedure Skip_UTF_Byte;
+ pragma Inline (Skip_UTF_Byte);
+ -- Skips past one encoded byte which must be 2#10xxxxxx#
+
+ ----------
+ -- Getc --
+ ----------
+
+ procedure Getc is
+ begin
+ if Ptr > Input'Last then
+ Past_End;
+ else
+ U := Unsigned_32 (Character'Pos (Input (Ptr)));
+ Ptr := Ptr + 1;
+ end if;
+ end Getc;
+
+ -------------------
+ -- Skip_UTF_Byte --
+ -------------------
+
+ procedure Skip_UTF_Byte is
+ begin
+ Getc;
+
+ if (U and 2#11000000#) /= 2#10_000000# then
+ Bad;
+ end if;
+ end Skip_UTF_Byte;
+
+ -- Start of processing for UTF-8 case
+
+ begin
+ if Ptr < Input'First then
+ Past_End;
+ end if;
+
+ -- 16#00_0000#-16#00_007F#: 0xxxxxxx
+
+ Getc;
+
+ if (U and 2#10000000#) = 2#00000000# then
+ null;
+
+ -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+ elsif (U and 2#11100000#) = 2#110_00000# then
+ Skip_UTF_Byte;
+
+ -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+ elsif (U and 2#11110000#) = 2#1110_0000# then
+ Skip_UTF_Byte;
+ Skip_UTF_Byte;
+
+ -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+ elsif (U and 2#11111000#) = 2#11110_000# then
+ for K in 1 .. 3 loop
+ Skip_UTF_Byte;
+ end loop;
+
+ -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+
+ elsif (U and 2#11111100#) = 2#111110_00# then
+ for K in 1 .. 4 loop
+ Skip_UTF_Byte;
+ end loop;
+
+ -- Any other code is invalid, note that this includes:
+
+ -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx 10xxxxxx
+
+ -- since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
+
+ else
+ Bad;
+ end if;
+ end UTF8;
+
+ -- Non-UTF-8 cass
+
+ else
+ declare
+ Discard : Wide_Wide_Character;
+ begin
+ Decode_Wide_Wide_Character (Input, Ptr, Discard);
+ end;
+ end if;
+ end Next_Wide_Wide_Character;
+
+ --------------
+ -- Past_End --
+ --------------
+
+ procedure Past_End is
+ begin
+ raise Constraint_Error with "past end of string";
+ end Past_End;
+
+ -------------------------
+ -- Prev_Wide_Character --
+ -------------------------
+
+ procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
+ begin
+ if Ptr > Input'Last + 1 then
+ Past_End;
+ end if;
+
+ -- Special efficient encoding for UTF-8 case
+
+ if Encoding_Method = WCEM_UTF8 then
+ UTF8 : declare
+ U : Unsigned_32;
+
+ procedure Getc;
+ pragma Inline (Getc);
+ -- Gets the character at Input (Ptr - 1) and returns code in U as
+ -- Unsigned_32 value. On return Ptr is decremented by one.
+
+ procedure Skip_UTF_Byte;
+ pragma Inline (Skip_UTF_Byte);
+ -- Checks that U is 2#10xxxxxx# and then calls Get
+
+ ----------
+ -- Getc --
+ ----------
+
+ procedure Getc is
+ begin
+ if Ptr <= Input'First then
+ Past_End;
+ else
+ Ptr := Ptr - 1;
+ U := Unsigned_32 (Character'Pos (Input (Ptr)));
+ end if;
+ end Getc;
+
+ -------------------
+ -- Skip_UTF_Byte --
+ -------------------
+
+ procedure Skip_UTF_Byte is
+ begin
+ if (U and 2#11000000#) = 2#10_000000# then
+ Getc;
+ else
+ Bad;
+ end if;
+ end Skip_UTF_Byte;
+
+ -- Start of processing for UTF-8 case
+
+ begin
+ -- 16#00_0000#-16#00_007F#: 0xxxxxxx
+
+ Getc;
+
+ if (U and 2#10000000#) = 2#00000000# then
+ return;
+
+ -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+ else
+ Skip_UTF_Byte;
+
+ if (U and 2#11100000#) = 2#110_00000# then
+ return;
+
+ -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+ else
+ Skip_UTF_Byte;
+
+ if (U and 2#11110000#) = 2#1110_0000# then
+ return;
+
+ -- Any other code is invalid, note that this includes:
+
+ -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
+ -- 10xxxxxx
+
+ -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+ -- 10xxxxxx
+
+ -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+
+ -- since Wide_Character does not allow codes > 16#FFFF#
+
+ else
+ Bad;
+ end if;
+ end if;
+ end if;
+ end UTF8;
+
+ -- Special efficient encoding for brackets case
+
+ elsif Encoding_Method = WCEM_Brackets then
+ Brackets : declare
+ P : Natural;
+ S : Natural;
+
+ begin
+ -- See if we have "] at end positions
+
+ if Ptr > Input'First + 1
+ and then Input (Ptr - 1) = ']'
+ and then Input (Ptr - 2) = '"'
+ then
+ P := Ptr - 2;
+
+ -- Loop back looking for [" at start
+
+ while P >= Ptr - 10 loop
+ if P <= Input'First + 1 then
+ Bad;
+
+ elsif Input (P - 1) = '"'
+ and then Input (P - 2) = '['
+ then
+ -- Found ["..."], scan forward to check it
+
+ S := P - 2;
+ P := S;
+ Next_Wide_Character (Input, P);
+
+ -- OK if at original pointer, else error
+
+ if P = Ptr then
+ Ptr := S;
+ return;
+ else
+ Bad;
+ end if;
+ end if;
+
+ P := P - 1;
+ end loop;
+
+ -- Falling through loop means more than 8 chars between the
+ -- enclosing brackets (or simply a missing left bracket)
+
+ Bad;
+
+ -- Here if no bracket sequence present
+
+ else
+ if Ptr = Input'First then
+ Past_End;
+ else
+ Ptr := Ptr - 1;
+ end if;
+ end if;
+ end Brackets;
+
+ -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
+ -- go to the start of the string and skip forwards till Ptr matches.
+
+ else
+ Non_UTF_Brackets : declare
+ Discard : Wide_Character;
+ PtrS : Natural;
+ PtrP : Natural;
+
+ begin
+ PtrS := Input'First;
+
+ if Ptr <= PtrS then
+ Past_End;
+ end if;
+
+ loop
+ PtrP := PtrS;
+ Decode_Wide_Character (Input, PtrS, Discard);
+
+ if PtrS = Ptr then
+ Ptr := PtrP;
+ return;
+
+ elsif PtrS > Ptr then
+ Bad;
+ end if;
+ end loop;
+
+ exception
+ when Constraint_Error =>
+ Bad;
+ end Non_UTF_Brackets;
+ end if;
+ end Prev_Wide_Character;
+
+ ------------------------------
+ -- Prev_Wide_Wide_Character --
+ ------------------------------
+
+ procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
+ begin
+ if Ptr > Input'Last + 1 then
+ Past_End;
+ end if;
+
+ -- Special efficient encoding for UTF-8 case
+
+ if Encoding_Method = WCEM_UTF8 then
+ UTF8 : declare
+ U : Unsigned_32;
+
+ procedure Getc;
+ pragma Inline (Getc);
+ -- Gets the character at Input (Ptr - 1) and returns code in U as
+ -- Unsigned_32 value. On return Ptr is decremented by one.
+
+ procedure Skip_UTF_Byte;
+ pragma Inline (Skip_UTF_Byte);
+ -- Checks that U is 2#10xxxxxx# and then calls Get
+
+ ----------
+ -- Getc --
+ ----------
+
+ procedure Getc is
+ begin
+ if Ptr <= Input'First then
+ Past_End;
+ else
+ Ptr := Ptr - 1;
+ U := Unsigned_32 (Character'Pos (Input (Ptr)));
+ end if;
+ end Getc;
+
+ -------------------
+ -- Skip_UTF_Byte --
+ -------------------
+
+ procedure Skip_UTF_Byte is
+ begin
+ if (U and 2#11000000#) = 2#10_000000# then
+ Getc;
+ else
+ Bad;
+ end if;
+ end Skip_UTF_Byte;
+
+ -- Start of processing for UTF-8 case
+
+ begin
+ -- 16#00_0000#-16#00_007F#: 0xxxxxxx
+
+ Getc;
+
+ if (U and 2#10000000#) = 2#00000000# then
+ return;
+
+ -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+ else
+ Skip_UTF_Byte;
+
+ if (U and 2#11100000#) = 2#110_00000# then
+ return;
+
+ -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
+
+ else
+ Skip_UTF_Byte;
+
+ if (U and 2#11110000#) = 2#1110_0000# then
+ return;
+
+ -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
+ -- 10xxxxxx
+
+ else
+ Skip_UTF_Byte;
+
+ if (U and 2#11111000#) = 2#11110_000# then
+ return;
+
+ -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+ -- 10xxxxxx
+
+ else
+ Skip_UTF_Byte;
+
+ if (U and 2#11111100#) = 2#111110_00# then
+ return;
+
+ -- Any other code is invalid, note that this includes:
+
+ -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+
+ -- since Wide_Wide_Character does not allow codes
+ -- greater than 16#03FF_FFFF#
+
+ else
+ Bad;
+ end if;
+ end if;
+ end if;
+ end if;
+ end if;
+ end UTF8;
+
+ -- Special efficient encoding for brackets case
+
+ elsif Encoding_Method = WCEM_Brackets then
+ Brackets : declare
+ P : Natural;
+ S : Natural;
+
+ begin
+ -- See if we have "] at end positions
+
+ if Ptr > Input'First + 1
+ and then Input (Ptr - 1) = ']'
+ and then Input (Ptr - 2) = '"'
+ then
+ P := Ptr - 2;
+
+ -- Loop back looking for [" at start
+
+ while P >= Ptr - 10 loop
+ if P <= Input'First + 1 then
+ Bad;
+
+ elsif Input (P - 1) = '"'
+ and then Input (P - 2) = '['
+ then
+ -- Found ["..."], scan forward to check it
+
+ S := P - 2;
+ P := S;
+ Next_Wide_Wide_Character (Input, P);
+
+ -- OK if at original pointer, else error
+
+ if P = Ptr then
+ Ptr := S;
+ return;
+ else
+ Bad;
+ end if;
+ end if;
+
+ P := P - 1;
+ end loop;
+
+ -- Falling through loop means more than 8 chars between the
+ -- enclosing brackets (or simply a missing left bracket)
+
+ Bad;
+
+ -- Here if no bracket sequence present
+
+ else
+ if Ptr = Input'First then
+ Past_End;
+ else
+ Ptr := Ptr - 1;
+ end if;
+ end if;
+ end Brackets;
+
+ -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
+ -- go to the start of the string and skip forwards till Ptr matches.
+
+ else
+ Non_UTF8_Brackets : declare
+ Discard : Wide_Wide_Character;
+ PtrS : Natural;
+ PtrP : Natural;
+
+ begin
+ PtrS := Input'First;
+
+ if Ptr <= PtrS then
+ Past_End;
+ end if;
+
+ loop
+ PtrP := PtrS;
+ Decode_Wide_Wide_Character (Input, PtrS, Discard);
+
+ if PtrS = Ptr then
+ Ptr := PtrP;
+ return;
+
+ elsif PtrS > Ptr then
+ Bad;
+ end if;
+ end loop;
+
+ exception
+ when Constraint_Error =>
+ Bad;
+ end Non_UTF8_Brackets;
+ end if;
+ end Prev_Wide_Wide_Character;
+
+ --------------------------
+ -- Validate_Wide_String --
+ --------------------------
+
+ function Validate_Wide_String (S : String) return Boolean is
+ Ptr : Natural;
+
+ begin
+ Ptr := S'First;
+ while Ptr <= S'Last loop
+ Next_Wide_Character (S, Ptr);
+ end loop;
+
+ return True;
+
+ exception
+ when Constraint_Error =>
+ return False;
+ end Validate_Wide_String;
+
+ -------------------------------
+ -- Validate_Wide_Wide_String --
+ -------------------------------
+
+ function Validate_Wide_Wide_String (S : String) return Boolean is
+ Ptr : Natural;
+
+ begin
+ Ptr := S'First;
+ while Ptr <= S'Last loop
+ Next_Wide_Wide_Character (S, Ptr);
+ end loop;
+
+ return True;
+
+ exception
+ when Constraint_Error =>
+ return False;
+ end Validate_Wide_Wide_String;
+
+end GNAT.Decode_String;
diff --git a/gcc/ada/g-decstr.ads b/gcc/ada/g-decstr.ads
new file mode 100755
index 00000000000..07d501552e0
--- /dev/null
+++ b/gcc/ada/g-decstr.ads
@@ -0,0 +1,163 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT RUN-TIME COMPONENTS --
+-- --
+-- G N A T . D E C O D E _ S T R I N G --
+-- --
+-- S p e c --
+-- --
+-- Copyright (C) 2007, AdaCore --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 2, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
+-- for more details. You should have received a copy of the GNU General --
+-- Public License distributed with GNAT; see file COPYING. If not, write --
+-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
+-- Boston, MA 02110-1301, USA. --
+-- --
+-- As a special exception, if other files instantiate generics from this --
+-- unit, or you link this unit with other files to produce an executable, --
+-- this unit does not by itself cause the resulting executable to be --
+-- covered by the GNU General Public License. This exception does not --
+-- however invalidate any other reasons why the executable file might be --
+-- covered by the GNU Public License. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+------------------------------------------------------------------------------
+
+-- This generic package provides utility routines for converting from an
+-- encoded string to a corresponding Wide_String or Wide_Wide_String value
+-- using a specified encoding convention, which is supplied as the generic
+-- parameter. UTF-8 is handled especially efficiently, and if the encoding
+-- method is known at compile time to be WCEM_UTF8, then the instantiation
+-- is specialized to handle only the UTF-8 case and exclude code for the
+-- other encoding methods. The package also provides positioning routines
+-- for skipping encoded characters in either direction, and for validating
+-- strings for correct encodings.
+
+-- Note: this package is only about decoding sequences of 8-bit characters
+-- into corresponding 16-bit Wide_String or 32-bit Wide_Wide_String values.
+-- It knows nothing at all about the character encodings being used for the
+-- resulting Wide_Character and Wide_Wide_Character values. Most often this
+-- will be Unicode/ISO-10646 as specified by the Ada RM, but this package
+-- does not make any assumptions about the character coding. See also the
+-- packages Ada.Wide_[Wide_]Characters.Unicode for unicode specific functions.
+
+-- Note on the use of brackets encoding (WCEM_Brackets). The brackets encoding
+-- method is ambiguous in the context of this package, since there is no way
+-- to tell if ["1234"] is eight unencoded characters or one encoded character.
+-- In the context of Ada sources, any sequence starting [" must be the start
+-- of an encoding (since that sequence is not valid in Ada source otherwise).
+-- The routines in this package use the same approach. If the input string
+-- contains the sequence [" then this is assumed to be the start of a brackets
+-- encoding sequence, and if it does not match the syntax, an error is raised.
+-- In the case of the Prev functions, a sequence ending with "] is assumed to
+-- be a valid brackets sequence, and an error is raised if it is not.
+
+with System.WCh_Con;
+
+generic
+ Encoding_Method : System.WCh_Con.WC_Encoding_Method;
+
+package GNAT.Decode_String is
+ pragma Pure;
+
+ function Decode_Wide_String (S : String) return Wide_String;
+ pragma Inline (Decode_Wide_String);
+ -- Decode the given String, which is encoded using the indicated coding
+ -- method, returning the corresponding decoded Wide_String value. If S
+ -- contains a character code that cannot be represented with the given
+ -- encoding, then Constraint_Error is raised.
+
+ procedure Decode_Wide_String
+ (S : String;
+ Result : out Wide_String;
+ Length : out Natural);
+ -- Similar to the above function except that the result is stored in the
+ -- given Wide_String variable Result, starting at Result (Result'First). On
+ -- return, Length is set to the number of characters stored in Result. The
+ -- caller must ensure that Result is long enough (an easy choice is to set
+ -- the length equal to the S'Length, since decoding can never increase the
+ -- string length). If the length of Result is insufficient Constraint_Error
+ -- will be raised.
+
+ function Decode_Wide_Wide_String (S : String) return Wide_Wide_String;
+ pragma Inline (Decode_Wide_Wide_String);
+ -- Same as above function but for Wide_Wide_String output
+
+ procedure Decode_Wide_Wide_String
+ (S : String;
+ Result : out Wide_Wide_String;
+ Length : out Natural);
+ -- Same as above procedure, but for Wide_Wide_String output
+
+ function Validate_Wide_String (S : String) return Boolean;
+ -- This function inspects the string S to determine if it contains only
+ -- valid encodings corresponding to Wide_Character values using the
+ -- given encoding. If a call to Decode_Wide_String (S) would return
+ -- without raising Constraint_Error, then Validate_Wide_String will
+ -- return True. If the call would have raised Constraint_Error, then
+ -- Validate_Wide_String will return False.
+
+ function Validate_Wide_Wide_String (S : String) return Boolean;
+ -- Similar to Validate_Wide_String, except that it succeeds if the string
+ -- contains only encodings corresponding to Wide_Wide_Character values.
+
+ procedure Decode_Wide_Character
+ (Input : String;
+ Ptr : in out Natural;
+ Result : out Wide_Character);
+ pragma Inline (Decode_Wide_Character);
+ -- This is a lower level procedure that decodes a single character using
+ -- the given encoding method. The encoded character is stored in Input,
+ -- starting at Input (Ptr). The resulting output character is stored in
+ -- Result, and on return Ptr is updated past the input character or
+ -- encoding sequence. Constraint_Error will be raised if the input has
+ -- has a character that cannot be represented using the given encoding,
+ -- or if Ptr is outside the bounds of the Input string.
+
+ procedure Decode_Wide_Wide_Character
+ (Input : String;
+ Ptr : in out Natural;
+ Result : out Wide_Wide_Character);
+ -- Same as above procedure but with Wide_Wide_Character input
+
+ procedure Next_Wide_Character (Input : String; Ptr : in out Natural);
+ -- This procedure examines the input string starting at Input (Ptr), and
+ -- advances Ptr past one character in the encoded string, so that on return
+ -- Ptr points to the next encoded character. Constraint_Error is raised if
+ -- an invalid encoding is encountered, or the end of the string is reached
+ -- or if Ptr is less than String'First on entry, or if the character
+ -- skipped is not a valid Wide_Character code. This call may be more
+ -- efficient than calling Decode_Wide_Character and discarding the result.
+
+ procedure Prev_Wide_Character (Input : String; Ptr : in out Natural);
+ -- This procedure is similar to Next_Encoded_Character except that it moves
+ -- backwards in the string, so that on return, Ptr is set to point to the
+ -- previous encoded character. Constraint_Error is raised if the start of
+ -- the string is encountered. It is valid for Ptr to be one past the end
+ -- of the string for this call (in which case on return it will point to
+ -- the last encoded character).
+ --
+ -- Note: it is not generally possible to do this function efficiently with
+ -- all encodings, the current implementation is only efficient for the case
+ -- of UTF-8 (Encoding_Method = WCEM_UTF8) and Brackets (Encoding_Method =
+ -- WCEM_Brackets). For all other encodings, we work by starting at the
+ -- beginning of the string and moving forward till Ptr is reached, which
+ -- is correct but slow.
+
+ procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural);
+ -- Similar to Next_Wide_Character except that codes skipped must be valid
+ -- Wide_Wide_Character codes.
+
+ procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural);
+ -- Similar to Prev_Wide_Character except that codes skipped must be valid
+ -- Wide_Wide_Character codes.
+
+end GNAT.Decode_String;
diff --git a/gcc/ada/g-deutst.ads b/gcc/ada/g-deutst.ads
new file mode 100644
index 00000000000..ca03ace5260
--- /dev/null
+++ b/gcc/ada/g-deutst.ads
@@ -0,0 +1,45 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT RUN-TIME COMPONENTS --
+-- --
+-- G N A T . D E C O D E _ U T F 8 _ S T R I N G --
+-- --
+-- S p e c --
+-- --
+-- Copyright (C) 2007, AdaCore --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 2, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
+-- for more details. You should have received a copy of the GNU General --
+-- Public License distributed with GNAT; see file COPYING. If not, write --
+-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
+-- Boston, MA 02110-1301, USA. --
+-- --
+-- As a special exception, if other files instantiate generics from this --
+-- unit, or you link this unit with other files to produce an executable, --
+-- this unit does not by itself cause the resulting executable to be --
+-- covered by the GNU General Public License. This exception does not --
+-- however invalidate any other reasons why the executable file might be --
+-- covered by the GNU Public License. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+------------------------------------------------------------------------------
+
+-- This package provides a pre-instantiation of GNAT.Decode_String for the
+-- common case of UTF-8 encoding. As noted in the documentation of that
+-- package, this UTF-8 instantiation is efficient and specialized so that
+-- it has only the code for the UTF-8 case. See g-decstr.ads for full
+-- documentation on this package.
+
+with GNAT.Decode_String;
+
+with System.WCh_Con;
+
+package GNAT.Decode_UTF8_String is
+ new GNAT.Decode_String (System.WCh_Con.WCEM_UTF8);
diff --git a/gcc/ada/g-encstr.adb b/gcc/ada/g-encstr.adb
new file mode 100755
index 00000000000..6f1411693fe
--- /dev/null
+++ b/gcc/ada/g-encstr.adb
@@ -0,0 +1,260 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT RUN-TIME COMPONENTS --
+-- --
+-- G N A T . E N C O D E _ S T R I N G --
+-- --
+-- B o d y --
+-- --
+-- Copyright (C) 2007, AdaCore --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 2, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
+-- for more details. You should have received a copy of the GNU General --
+-- Public License distributed with GNAT; see file COPYING. If not, write --
+-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
+-- Boston, MA 02110-1301, USA. --
+-- --
+-- As a special exception, if other files instantiate generics from this --
+-- unit, or you link this unit with other files to produce an executable, --
+-- this unit does not by itself cause the resulting executable to be --
+-- covered by the GNU General Public License. This exception does not --
+-- however invalidate any other reasons why the executable file might be --
+-- covered by the GNU Public License. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+------------------------------------------------------------------------------
+
+with Interfaces; use Interfaces;
+
+with System.WCh_Con; use System.WCh_Con;
+with System.WCh_Cnv; use System.WCh_Cnv;
+
+package body GNAT.Encode_String is
+
+ -----------------------
+ -- Local Subprograms --
+ -----------------------
+
+ procedure Bad;
+ pragma No_Return (Bad);
+ -- Raise error for bad character code
+
+ procedure Past_End;
+ pragma No_Return (Past_End);
+ -- Raise error for off end of string
+
+ ---------
+ -- Bad --
+ ---------
+
+ procedure Bad is
+ begin
+ raise Constraint_Error with
+ "character cannot be encoded with given Encoding_Method";
+ end Bad;
+
+ ------------------------
+ -- Encode_Wide_String --
+ ------------------------
+
+ function Encode_Wide_String (S : Wide_String) return String is
+ Long : constant Natural := WC_Longest_Sequences (Encoding_Method);
+ Result : String (1 .. S'Length * Long);
+ Length : Natural;
+ begin
+ Encode_Wide_String (S, Result, Length);
+ return Result (1 .. Length);
+ end Encode_Wide_String;
+
+ procedure Encode_Wide_String
+ (S : Wide_String;
+ Result : out String;
+ Length : out Natural)
+ is
+ Ptr : Natural;
+
+ begin
+ Ptr := S'First;
+ for J in S'Range loop
+ Encode_Wide_Character (S (J), Result, Ptr);
+ end loop;
+
+ Length := Ptr - S'First;
+ end Encode_Wide_String;
+
+ -----------------------------
+ -- Encode_Wide_Wide_String --
+ -----------------------------
+
+ function Encode_Wide_Wide_String (S : Wide_Wide_String) return String is
+ Long : constant Natural := WC_Longest_Sequences (Encoding_Method);
+ Result : String (1 .. S'Length * Long);
+ Length : Natural;
+ begin
+ Encode_Wide_Wide_String (S, Result, Length);
+ return Result (1 .. Length);
+ end Encode_Wide_Wide_String;
+
+ procedure Encode_Wide_Wide_String
+ (S : Wide_Wide_String;
+ Result : out String;
+ Length : out Natural)
+ is
+ Ptr : Natural;
+
+ begin
+ Ptr := S'First;
+ for J in S'Range loop
+ Encode_Wide_Wide_Character (S (J), Result, Ptr);
+ end loop;
+
+ Length := Ptr - S'First;
+ end Encode_Wide_Wide_String;
+
+ ---------------------------
+ -- Encode_Wide_Character --
+ ---------------------------
+
+ procedure Encode_Wide_Character
+ (Char : Wide_Character;
+ Result : in out String;
+ Ptr : in out Natural)
+ is
+ begin
+ Encode_Wide_Wide_Character
+ (Wide_Wide_Character'Val (Wide_Character'Pos (Char)), Result, Ptr);
+
+ exception
+ when Constraint_Error =>
+ Bad;
+ end Encode_Wide_Character;
+
+ --------------------------------
+ -- Encode_Wide_Wide_Character --
+ --------------------------------
+
+ procedure Encode_Wide_Wide_Character
+ (Char : Wide_Wide_Character;
+ Result : in out String;
+ Ptr : in out Natural)
+ is
+ U : Unsigned_32;
+
+ procedure Out_Char (C : Character);
+ pragma Inline (Out_Char);
+ -- Procedure to store one character for instantiation below
+
+ --------------
+ -- Out_Char --
+ --------------
+
+ procedure Out_Char (C : Character) is
+ begin
+ if Ptr > Result'Last then
+ Past_End;
+ else
+ Result (Ptr) := C;
+ Ptr := Ptr + 1;
+ end if;
+ end Out_Char;
+
+ -- Start of processing for Encode_Wide_Wide_Character;
+
+ begin
+ -- Efficient code for UTF-8 case
+
+ if Encoding_Method = WCEM_UTF8 then
+
+ -- Note: for details of UTF8 encoding see RFC 3629
+
+ U := Unsigned_32 (Wide_Wide_Character'Pos (Char));
+
+ -- 16#00_0000#-16#00_007F#: 0xxxxxxx
+
+ if U <= 16#00_007F# then
+ Out_Char (Character'Val (U));
+
+ -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
+
+ elsif U <= 16#00_07FF# then
+ Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6)));
+ Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
+
+ -- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
+
+ elsif U <= 16#00_FFFF# then
+ Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12)));
+ Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
+ and 2#00111111#)));
+ Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
+
+ -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+ elsif U <= 16#10_FFFF# then
+ Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18)));
+ Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
+ and 2#00111111#)));
+ Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
+ and 2#00111111#)));
+ Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
+
+ -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx
+
+ elsif U <= 16#03FF_FFFF# then
+ Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24)));
+ Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
+ and 2#00111111#)));
+ Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
+ and 2#00111111#)));
+ Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
+ and 2#00111111#)));
+ Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
+
+ -- All other cases are invalid character codes, not this includes:
+
+ -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
+ -- 10xxxxxx 10xxxxxx 10xxxxxx
+
+ -- since Wide_Wide_Character values cannot exceed 16#3F_FFFF#
+
+ else
+ Bad;
+ end if;
+
+ -- All encoding methods other than UTF-8
+
+ else
+ Non_UTF8 : declare
+ procedure UTF_32_To_String is
+ new UTF_32_To_Char_Sequence (Out_Char);
+ -- Instantiate conversion procedure with above Out_Char routine
+
+ begin
+ UTF_32_To_String
+ (UTF_32_Code (Wide_Wide_Character'Pos (Char)), Encoding_Method);
+
+ exception
+ when Constraint_Error =>
+ Bad;
+ end Non_UTF8;
+ end if;
+ end Encode_Wide_Wide_Character;
+
+ --------------
+ -- Past_End --
+ --------------
+
+ procedure Past_End is
+ begin
+ raise Constraint_Error with "past end of string";
+ end Past_End;
+
+end GNAT.Encode_String;
diff --git a/gcc/ada/g-encstr.ads b/gcc/ada/g-encstr.ads
new file mode 100755
index 00000000000..5862bbe182d
--- /dev/null
+++ b/gcc/ada/g-encstr.ads
@@ -0,0 +1,111 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT RUN-TIME COMPONENTS --
+-- --
+-- G N A T . E N C O D E _ S T R I N G --
+-- --
+-- S p e c --
+-- --
+-- Copyright (C) 2007, AdaCore --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 2, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
+-- for more details. You should have received a copy of the GNU General --
+-- Public License distributed with GNAT; see file COPYING. If not, write --
+-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
+-- Boston, MA 02110-1301, USA. --
+-- --
+-- As a special exception, if other files instantiate generics from this --
+-- unit, or you link this unit with other files to produce an executable, --
+-- this unit does not by itself cause the resulting executable to be --
+-- covered by the GNU General Public License. This exception does not --
+-- however invalidate any other reasons why the executable file might be --
+-- covered by the GNU Public License. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+------------------------------------------------------------------------------
+
+-- This generic package provides utility routines for converting from
+-- Wide_String or Wide_Wide_String to encoded String using a specified
+-- encoding convention, which is supplied as the generic parameter. If
+-- this parameter is a known at compile time constant (e.g. a constant
+-- definned in System.WCh_Con), the instantiation is specialized so that
+-- it applies only to this specified coding.
+
+-- Note: this package is only about encoding sequences of 16- or 32-bit
+-- characters into a sequence of 8-bit codes. It knows nothing at all about
+-- the character encodings being used for the input Wide_Character and
+-- Wide_Wide_Character values, although some of the encoding methods (notably
+-- JIS and EUC) have built in assumptions about the range of possible input
+-- code values. Most often the input will be Unicode/ISO-10646 as specified by
+-- the Ada RM, but this package does not make any assumptions about the
+-- character coding, and in the case of UTF-8 all possible code values can be
+-- encoded. See also the packages Ada.Wide_[Wide_]Characters.Unicode for
+-- unicode specific functions.
+
+-- Note on brackets encoding (WCEM_Brackets). On input, upper half characters
+-- can be represented as ["hh"] but the routines in this package will only use
+-- brackets encodings for codes higher than 16#FF#, so upper half characters
+-- will be output as single Character values.
+
+with System.WCh_Con;
+
+generic
+ Encoding_Method : System.WCh_Con.WC_Encoding_Method;
+
+package GNAT.Encode_String is
+ pragma Pure;
+
+ function Encode_Wide_String (S : Wide_String) return String;
+ pragma Inline (Encode_Wide_String);
+ -- Encode the given Wide_String, returning a String encoded using the
+ -- given encoding method. Constraint_Error will be raised if the encoding
+ -- method cannot accomodate the input data.
+
+ procedure Encode_Wide_String
+ (S : Wide_String;
+ Result : out String;
+ Length : out Natural);
+ -- Encode the given Wide_String, storing the encoded string in Result,
+ -- with Length being set to the length of the encoded string. The caller
+ -- must ensure that Result is long enough (see useful constants defined
+ -- in System.WCh_Con: WC_Longest_Sequence, WC_Longest_Sequences). If the
+ -- length of Result is insufficient Constraint_Error will be raised.
+ -- Constraint_Error will also be raised if the encoding method cannot
+ -- accomodate the input data.
+
+ function Encode_Wide_Wide_String (S : Wide_Wide_String) return String;
+ pragma Inline (Encode_Wide_Wide_String);
+ -- Same as above function but for Wide_Wide_String input
+
+ procedure Encode_Wide_Wide_String
+ (S : Wide_Wide_String;
+ Result : out String;
+ Length : out Natural);
+ -- Same as above procedure, but for Wide_Wide_String input
+
+ procedure Encode_Wide_Character
+ (Char : Wide_Character;
+ Result : in out String;
+ Ptr : in out Natural);
+ pragma Inline (Encode_Wide_Character);
+ -- This is a lower level procedure that encodes the single character Char.
+ -- The output is stored in Result starting at Result (Ptr), and Ptr is
+ -- updated past the stored value. Constraint_Error is raised if Result
+ -- is not long enough to accomodate the result, or if the encoding method
+ -- specified does not accomodate the input character value, or if Ptr is
+ -- outside the bounds of the Result string.
+
+ procedure Encode_Wide_Wide_Character
+ (Char : Wide_Wide_Character;
+ Result : in out String;
+ Ptr : in out Natural);
+ -- Same as above procedure but with Wide_Wide_Character input
+
+end GNAT.Encode_String;
diff --git a/gcc/ada/g-enutst.ads b/gcc/ada/g-enutst.ads
new file mode 100644
index 00000000000..3c4632866bf
--- /dev/null
+++ b/gcc/ada/g-enutst.ads
@@ -0,0 +1,45 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT RUN-TIME COMPONENTS --
+-- --
+-- G N A T . E N C O D E _ U T F 8 _ S T R I N G --
+-- --
+-- S p e c --
+-- --
+-- Copyright (C) 2007, AdaCore --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 2, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
+-- for more details. You should have received a copy of the GNU General --
+-- Public License distributed with GNAT; see file COPYING. If not, write --
+-- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
+-- Boston, MA 02110-1301, USA. --
+-- --
+-- As a special exception, if other files instantiate generics from this --
+-- unit, or you link this unit with other files to produce an executable, --
+-- this unit does not by itself cause the resulting executable to be --
+-- covered by the GNU General Public License. This exception does not --
+-- however invalidate any other reasons why the executable file might be --
+-- covered by the GNU Public License. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+------------------------------------------------------------------------------
+
+-- This package provides a pre-instantiation of GNAT.Encode_String for the
+-- common case of UTF-8 encoding. As noted in the documentation of that
+-- package, this UTF-8 instantiation is efficient and specialized so that
+-- it has only the code for the UTF-8 case. See g-encstr.ads for full
+-- documentation on this package.
+
+with GNAT.Encode_String;
+
+with System.WCh_Con;
+
+package GNAT.Encode_UTF8_String is
+ new GNAT.Encode_String (System.WCh_Con.WCEM_UTF8);
diff --git a/gcc/ada/scn.adb b/gcc/ada/scn.adb
index f970d84ab98..d4e7bd387d9 100644
--- a/gcc/ada/scn.adb
+++ b/gcc/ada/scn.adb
@@ -28,6 +28,7 @@ with Csets; use Csets;
with Hostparm; use Hostparm;
with Namet; use Namet;
with Opt; use Opt;
+with Output; use Output;
with Restrict; use Restrict;
with Rident; use Rident;
with Scans; use Scans;
@@ -35,6 +36,10 @@ with Sinfo; use Sinfo;
with Sinput; use Sinput;
with Uintp; use Uintp;
+with GNAT.Byte_Order_Mark; use GNAT.Byte_Order_Mark;
+
+with System.WCh_Con; use System.WCh_Con;
+
package body Scn is
use ASCII;
@@ -266,6 +271,42 @@ package body Scn is
Set_License (Current_Source_File, Determine_License);
end if;
+ -- Check for BOM
+
+ declare
+ BOM : BOM_Kind;
+ Len : Natural;
+ Tst : String (1 .. 5);
+
+ begin
+ for J in 1 .. 5 loop
+ Tst (J) := Source (Scan_Ptr + Source_Ptr (J) - 1);
+ end loop;
+
+ Read_BOM (Tst, Len, BOM, False);
+
+ case BOM is
+ when UTF8_All =>
+ Scan_Ptr := Scan_Ptr + Source_Ptr (Len);
+ Wide_Character_Encoding_Method := WCEM_UTF8;
+ Upper_Half_Encoding := True;
+
+ when UTF16_LE | UTF16_BE =>
+ Write_Line ("UTF-16 encoding format not recognized");
+ raise Unrecoverable_Error;
+
+ when UTF32_LE | UTF32_BE =>
+ Write_Line ("UTF-32 encoding format not recognized");
+ raise Unrecoverable_Error;
+
+ when Unknown =>
+ null;
+
+ when others =>
+ raise Program_Error;
+ end case;
+ end;
+
-- Because of the License stuff above, Scng.Initialize_Scanner cannot
-- call Scan. Scan initial token (note this initializes Prev_Token,
-- Prev_Token_Ptr).
OpenPOWER on IntegriCloud