1 // Copyright Ferdinand Majerech 2014, Digital Mars 2000-2012, Andrei Alexandrescu 2008- and Jonathan M Davis 2011-.
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 
7 /// @nogc versions of or alternatives to Phobos functions that are not yet @nogc and
8 /// wrappers to simplify their use.
9 module dyaml.nogcutil;
10 
11 
12 
13 import std.traits;
14 import std.typecons;
15 import std.typetuple;
16 import std.range;
17 
18 
19 
/// A NoGC version of std.conv.parse for integer types.
///
/// Reads digits valid for `radix` from the start of `s` and stops at the first
/// character that is not such a digit (any non-ASCII code unit ends the number).
/// Letters are accepted in either case for radixes above 10.
///
/// Differences from std.conv.parse:
///    overflow parameter - bool set to true if there was integer overflow.
///    Asserts that at least one character was parsed instead of throwing an exception.
///    The caller must validate the inputs before calling parseNoGC.
///    `s` is only read; it is not advanced past the parsed digits.
///    No sign is parsed; the result is never negative.
///
/// Params:
///
/// s        = Text to parse. Must start with at least one digit valid in radix.
/// radix    = Numeric base, must be in the range [2, 36].
/// overflow = Set to true if the parsed value does not fit into Target.
///
/// Returns: The parsed value, or Target.max if overflow was set to true.
Target parseNoGC(Target, Source)(ref Source s, uint radix, out bool overflow)
    @safe pure nothrow @nogc
    if (isSomeChar!(ElementType!Source) &&
        isIntegral!Target && !is(Target == enum))
in { assert(radix >= 2 && radix <= 36); }
body
{
    // First character code that is NOT a digit in this radix ('0'-based for radixes
    // below 10, lowercase-letter-based otherwise).
    immutable uint beyond = (radix < 10 ? '0' : 'a'-10) + radix;

    Target v = 0;
    bool atStart = true;

    // We can safely foreach over individual code units.
    // Even with UTF-8 any digit is ASCII and anything not ASCII (such as the start of
    // a UTF-8 sequence) is not a digit.
    foreach(i; 0 .. s.length)
    {
        // We can just take a code unit instead of decoding because anything non-ASCII
        // is not going to be a decodable digit, i.e. we will end at such a unit.
        dchar c = s[i];
        if (c < '0' || c >= 0x80)
            break;
        if (radix < 10)
        {
            if (c >= beyond)
                break;
        }
        else if (c > '9')
        {
            c |= 0x20; // poor man's tolower
            if (c < 'a' || c >= beyond) { break; }
            // Remap 'a'.. so that (c - '0') below yields 10..
            c -= 'a'-10-'0';
        }

        immutable uint digit = c - '0';

        // Exact overflow test, done before multiplying so nothing can wrap:
        //     v * radix + digit > Target.max  <=>  v > (Target.max - digit) / radix
        // v is never negative and digit <= 35 < Target.max for every integral type,
        // so the ulong arithmetic below cannot underflow.
        if (cast(ulong)v > (cast(ulong)Target.max - digit) / radix)
        {
            overflow = true;
            return Target.max;
        }

        v = cast(Target)(v * radix + digit);
        atStart = false;
    }
    assert(!atStart, "Nothing to parse in parse()");
    return v;
}
74 
75 
/// Builds a message in a buffer similarly to writef/writefln, but without
/// using GC.
///
/// C snprintf would be better, but it isn't pure.
/// formattedWrite isn't completely @nogc yet (although it isn't GC-heavy).
///
/// The arguments are written one after another, starting at the beginning of
/// buffer. The user has to ensure buffer is long enough - an assert checks that we
/// don't run out of space. Currently this can only write strings and dchars.
///
/// Returns: The part of buffer that was written to.
char[] printNoGC(S...)(char[] buffer, S args) @safe pure nothrow @nogc
{
    auto output = appenderNoGC(buffer);

    foreach(piece; args)
    {
        alias Piece = typeof(piece);
        enum isCodePoint = is(Unqual!Piece == dchar);
        enum isText      = is(Piece == char[]) || is(Piece == string);

        static if(isText)
        {
            output.put(piece);
        }
        else static if(isCodePoint)
        {
            output.putDChar(piece);
        }
        else
        {
            static assert(false, "printNoGC does not support " ~ Piece.stringof);
        }
    }

    return output.data;
}
98 
99 
/// A UFCS utility function that appends a dchar, encoded as UTF-8, to an
/// AppenderNoGCFixed.
///
/// ASCII characters are written directly as a single byte; anything else is encoded
/// with encodeValidCharNoGC.
///
/// The char $(B must) be a valid dchar.
void putDChar(ref AppenderNoGCFixed!(char[], char) appender, dchar c)
    @safe pure nothrow @nogc
{
    char[4] encoded;
    size_t count = 1;

    if(c < 0x80)
    {
        encoded[0] = cast(char)c;
    }
    else
    {
        // Should be safe to use as the first thing Reader does is validate everything.
        count = encodeValidCharNoGC(encoded, c);
    }

    appender.put(encoded[0 .. count]);
}
117 
/// Convenience function that returns an $(D AppenderNoGCFixed!(E[])) using
/// $(D array) for storage.
AppenderNoGCFixed!(E[]) appenderNoGC(A : E[], E)(A array)
{
    alias Appender = typeof(return);
    return Appender(array);
}
124 
/// A gutted, NoGC version of std.array.appender.
///
/// Works on a fixed-size buffer supplied by the user; the buffer is never grown.
/// Running out of space is only caught by an assert.
struct AppenderNoGCFixed(A : T[], T)
{
    import std.array;

    private struct Data
    {
        // Length of the buffer passed to the constructor.
        size_t capacity;
        // Slice of the buffer that has been written so far.
        Unqual!T[] arr;
        bool canExtend = false;
    }

    private Data _data;

    @nogc:

    /// Construct an appender that will work with given buffer.
    ///
    /// Data written to the appender will overwrite the buffer from the start.
    this(T[] arr) @trusted pure nothrow
    {
        _data.capacity = arr.length;
        // @trusted: strips qualifiers from the (initially empty) written slice.
        _data.arr = cast(Unqual!T[])arr[0 .. 0];
    }

    /**
     * Returns the length of the buffer this appender was constructed with, i.e. the
     * maximum number of elements that can be stored in it.
     */
    @property size_t capacity() const @safe pure nothrow
    {
        return _data.capacity;
    }

    /**
     * Returns the part of the buffer written so far.
     */
    @property inout(T)[] data() inout @trusted pure nothrow
    {
        // @trusted: casts Unqual!T[] back to inout(T)[].
        return cast(inout(T)[])_data.arr;
    }

    // Assert that nelems more elements fit into the buffer. Nothing is resized.
    private void ensureAddable(size_t nelems) @safe pure nothrow
    {
        assert(_data.arr.length + nelems <= _data.capacity,
                "AppenderFixed ran out of space");
    }

    /// Append items to the written data.
    ///
    /// The buffer must have enough free space left; this is checked by an assert.
    void put(U)(U[] items) if (is(Unqual!U == T))
    {
        ensureAddable(items.length);

        immutable oldLength = _data.arr.length;
        immutable newLength = oldLength + items.length;

        // @trusted: widens the written slice over buffer space that ensureAddable
        // has just asserted to be available.
        auto widened = () @trusted nothrow { return _data.arr.ptr[0 .. newLength]; }();

        widened[oldLength .. newLength] = items[];

        // Publish the new length only once the copy is done.
        _data.arr = widened;
    }

    // only allow overwriting data on non-immutable and non-const data
    static if (!isMutable!T)
    {
        /// Clear is not available for const/immutable data.
        @disable void clear();
    }
    else
    {
        /**
         * Clears the managed array.  This allows the elements of the array to be reused
         * for appending.
         *
         * Note that clear is disabled for immutable or const element types, due to the
         * possibility that $(D AppenderNoGCFixed) might overwrite immutable data.
         */
        void clear() @safe pure nothrow
        {
            auto emptied = () @trusted { return _data.arr.ptr[0 .. 0]; }();
            _data.arr = emptied;
        }
    }
}
// Appending a string, an ASCII dchar and a non-ASCII dchar to a fixed buffer.
unittest
{
    char[256] storage;
    auto sink = appenderNoGC(storage[]);

    sink.put("found unsupported escape character: ");
    sink.putDChar('a');
    sink.putDChar('á');

    assert(sink.data == "found unsupported escape character: aá");
}
229 
230 
/// Result of a validateUTF8NoGC call.
struct ValidateResult
{
    /// True if the validated string is valid.
    bool valid = false;

    /// Number of characters in the string.
    ///
    /// For an invalid string, this is the number of valid characters found before
    /// the first invalid sequence.
    size_t characterCount = 0;

    /// Error message with details; only set if the string is not valid.
    string msg = null;
}
244 
/// Validate a UTF-8 string, checking if it is well-formed Unicode.
///
/// ASCII bytes are counted directly; every other character is decoded with
/// decodeUTF8NoGC. Validation stops at the first sequence that fails to decode.
///
/// Returns: A ValidateResult with the number of characters seen, and, on failure,
///          the error message produced by the decoder.
///
/// See_Also: ValidateResult
ValidateResult validateUTF8NoGC(const(char[]) str) @trusted pure nothrow @nogc
{
    size_t count = 0;
    size_t pos   = 0;

    while(pos < str.length)
    {
        if(str[pos] >= 0x80)
        {
            // Multi-byte sequence; the decoder advances pos on success.
            auto result = decodeUTF8NoGC!(No.validated)(str, pos);
            if(result.errorMessage !is null)
            {
                return ValidateResult(false, count, result.errorMessage);
            }
        }
        else
        {
            ++pos;
        }
        ++count;
    }

    return ValidateResult(true, count);
}
271 
/// @nogc version of std.utf.decode() for char[].
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// validated = If true, assume str is a valid UTF-8 string and don't generate any
///             error-checking code (only asserts). If validated is true, str
///             $(B must) be valid UTF-8, otherwise undefined behavior will occur.
///             Also affects the return type.
/// str       = Will decode one code point from this string.
/// index     = Index in str where the code point starts. Will be updated to point to
///             the next code point. On most errors index is left untouched, but for
///             a 4-byte sequence that decodes to a value above dchar.max it has
///             already been advanced when the error is returned.
///
/// Returns: If validated is true, the decoded character.
///          Otherwise a struct with a 'decoded' member - the decoded character, and a
///          'string errorMessage' member that is null on success and otherwise stores
///          the error message ('decoded' is cast(dchar)int.max in that case).
auto decodeUTF8NoGC(Flag!"validated" validated)(const(char[]) str, ref size_t index)
    @trusted pure nothrow @nogc
{
    static if(validated)
    {
        alias Result = dchar;
    }
    else
    {
        struct Result
        {
            dchar decoded;
            string errorMessage;
        }
    }

    /// Dchar bitmask for different numbers of UTF-8 code units.
    enum bitMask = tuple(0x7F, 0x7FF, 0xFFFF, 0x1F_FFFF);

    enum invalidUTFMsg = "Invalid UTF-8 sequence";
    static if(!validated) { enum failure = Result(cast(dchar)int.max, invalidUTFMsg); }

    auto units = str.ptr + index;
    // Number of code units available from index to the end of str.
    immutable remaining = str.length - index;

    ubyte lead = units[0];
    assert(lead & 0x80);

    // starter must have at least 2 first bits set
    static if(validated)
    {
        assert((lead & 0xC0) == 0xC0, invalidUTFMsg);
    }
    else
    {
        if((lead & 0xC0) != 0xC0) { return failure; }
    }

    dchar codePoint = lead; // upper control bits are masked out later
    // Each left shift moves the next length bit of the lead byte into bit 7.
    lead <<= 1;

    // Unrolled at compile time: i is the index of the continuation byte being read.
    foreach (i; TypeTuple!(1, 2, 3))
    {
        static if(validated)
        {
            assert(i != remaining, "Decoding out of bounds");
        }
        else
        {
            if(i == remaining)
            {
                return Result(cast(dchar)int.max, "Decoding out of bounds");
            }
        }

        immutable ubyte trail = units[i];
        // continuation bytes must look like 0b10xx_xxxx
        static if(validated)
        {
            assert((trail & 0xC0) == 0x80, invalidUTFMsg);
        }
        else
        {
            if((trail & 0xC0) != 0x80) { return failure; }
        }

        codePoint = (codePoint << 6) | (trail & 0x3F);
        lead <<= 1;

        if ((lead & 0x80) == 0) // no more bytes
        {
            codePoint &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            static if(validated)
            {
                assert((codePoint & ~bitMask[i - 1]) != 0, invalidUTFMsg);
            }
            else
            {
                if((codePoint & ~bitMask[i - 1]) == 0) { return failure; }
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                static if(validated)
                {
                    assert(isValidDchar(codePoint), invalidUTFMsg);
                }
                else
                {
                    if(!isValidDchar(codePoint)) { return failure; }
                }
            }

            index += i + 1;

            // only a 4-byte sequence can exceed the valid code point range
            static if (i == 3)
            {
                static if(validated)
                {
                    assert(codePoint <= dchar.max, invalidUTFMsg);
                }
                else
                {
                    if(codePoint > dchar.max) { return failure; }
                }
            }

            return Result(codePoint);
        }
    }

    // The lead byte announced more than 4 code units.
    static if(validated) { assert(false, invalidUTFMsg); }
    else                 { return failure; }
}
369 
/// @nogc version of std.utf.decode() for char[] that assumes str is valid UTF-8.
///
/// This is decodeUTF8NoGC instantiated with the validated flag set.
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// str   = Will decode one code point from this string. Must be valid UTF-8,
///         otherwise undefined behavior WILL occur.
/// index = Index in str where the code point starts. Will be updated to point to the
///         next code point.
///
/// Returns: The decoded character.
alias decodeValidUTF8NoGC = decodeUTF8NoGC!(Flag!"validated".yes);
382 
/// @nogc version of std.utf.encode() for char[].
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
/// validated = If true, assume c is a valid, non-surrogate UTF-32 code point and don't
///             generate any error-checking code (only asserts). If validated is true,
///             c $(B must) be a valid character, otherwise undefined behavior will
///             occur. Also affects the return type.
/// buf       = Buffer to write the encoded result to.
/// c         = Character to encode.
///
/// Returns: If validated is true, number of bytes the encoded character takes up in buf.
///          Otherwise a struct with a 'bytes' member specifying the number of bytes of
///          the encoded character, and a 'string errorMessage' member that is null
///          if there was no error and otherwise stores the error message ('bytes' is
///          size_t.max in that case).
auto encodeCharNoGC(Flag!"validated" validated)(ref char[4] buf, dchar c)
    @safe pure nothrow @nogc
{
    static if(validated)
    {
        alias Result = size_t;
    }
    else
    {
        struct Result
        {
            size_t bytes;
            string errorMessage;
        }
    }

    // Continuation byte (0b10xx_xxxx) holding the six bits of point starting at shift.
    static char continuation(dchar point, uint shift) @safe pure nothrow @nogc
    {
        return cast(char)(0x80 | ((point >> shift) & 0x3F));
    }

    // Force the caller to optimize ASCII (the 1-byte case)
    assert(c >= 0x80, "Caller should explicitly handle ASCII chars");

    // 2 bytes: 110x_xxxx 10xx_xxxx
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = continuation(c, 0);
        return Result(2);
    }

    // 3 bytes: 1110_xxxx 10xx_xxxx 10xx_xxxx
    if (c <= 0xFFFF)
    {
        static if(validated)
        {
            assert(c < 0xD800 || c > 0xDFFF,
                   "Supposedly valid code point is a surrogate code point");
        }
        else
        {
            if(c >= 0xD800 && c <= 0xDFFF)
            {
                return Result(size_t.max, "Can't encode a surrogate code point in UTF-8");
            }
        }

        assert(isValidDchar(c));
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = continuation(c, 6);
        buf[2] = continuation(c, 0);
        return Result(3);
    }

    // 4 bytes: 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
    if (c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = continuation(c, 12);
        buf[2] = continuation(c, 6);
        buf[3] = continuation(c, 0);
        return Result(4);
    }

    // Anything above 0x10FFFF is not a code point.
    assert(!isValidDchar(c));
    static if(validated)
    {
        assert(false, "Supposedly valid code point is invalid");
    }
    else
    {
        return Result(size_t.max, "Can't encode an invalid code point in UTF-8");
    }
}
457 
/// @nogc version of std.utf.encode() for char[] that assumes c is a valid UTF-32 char.
///
/// This is encodeCharNoGC instantiated with the validated flag set.
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// buf = Buffer to write the encoded result to.
/// c   = Character to encode. Must be valid UTF-32, otherwise undefined behavior
///       $(D will) occur.
///
/// Returns: Number of bytes the encoded character takes up in buf.
alias encodeValidCharNoGC = encodeCharNoGC!(Flag!"validated".yes);
471 
/// @nogc version of std.utf.isValidDchar
///
/// Returns: true if c is at most 0x10FFFF and is not in the surrogate range
///          0xD800 .. 0xDFFF (inclusive).
bool isValidDchar(dchar c) @safe pure nothrow @nogc
{
    if (c > 0x10FFFF) { return false; }

    immutable isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    return !isSurrogate;
}