//          Copyright Ferdinand Majerech 2014, Digital Mars 2000-2012, Andrei Alexandrescu 2008- and Jonathan M Davis 2011-.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)


/// @nogc versions of or alternatives to Phobos functions that are not yet @nogc and
/// wrappers to simplify their use.
module dyaml.nogcutil;



import std.traits;
import std.typecons;
import std.typetuple;
import std.range;



/// A NoGC version of std.conv.parse for integer types.
///
/// Differences:
///    overflow parameter - bool set to true if there was integer overflow.
///    Asserts that at least one character was parsed instead of throwing an exception.
///    The caller must validate the inputs before calling parseNoGC.
///    s is only read; it is not advanced past the parsed digits.
///    No sign is parsed; parsing stops at the first character that is not a digit
///    in the given radix.
///
/// Params:
///
/// s        = Characters to parse. Must start with at least one digit valid in radix.
/// radix    = Base of the number, between 2 and 36 inclusive. Letters are accepted in
///            both upper and lower case.
/// overflow = Set to true if the parsed value does not fit into Target.
///
/// Returns: The parsed value, or Target.max if overflow was set.
Target parseNoGC(Target, Source)(ref Source s, uint radix, out bool overflow)
    @safe pure nothrow @nogc
    if (isSomeChar!(ElementType!Source) &&
        isIntegral!Target && !is(Target == enum))
in { assert(radix >= 2 && radix <= 36); }
body
{
    // First character value that is NOT a digit in this radix.
    immutable uint beyond = (radix < 10 ? '0' : 'a'-10) + radix;

    Target v = 0;
    bool atStart = true;

    // We can safely iterate over individual code units.
    // Even with UTF-8 any digit is ASCII and anything not ASCII (such as the start of
    // a UTF-8 sequence) is not a digit.
    foreach(i; 0 .. s.length)
    {
        dchar c = s[i];
        // We can just take a char instead of decoding because anything non-ASCII is not
        // going to be a decodable digit, i.e. we will end at such a byte.
        if (c < '0' || c >= 0x80)
            break;
        if (radix < 10)
        {
            if (c >= beyond)
                break;
        }
        else
        {
            if (c > '9')
            {
                c |= 0x20;//poorman's tolower
                if (c < 'a' || c >= beyond) { break; }
                // Map 'a'.. onto the values directly following '9'.
                c -= 'a'-10-'0';
            }
        }

        immutable uint digit = c - '0';
        // Check BEFORE multiplying: comparing the wrapped result with the previous
        // value misses overflows where the wrapped product is still larger than v.
        // v is never negative here, so v * radix + digit fits exactly when
        // v <= (Target.max - digit) / radix.
        immutable Target limit = cast(Target)((Target.max - digit) / radix);
        if (v > limit)
        {
            overflow = true;
            return Target.max;
        }
        v = cast(Target)(v * radix + digit);
        atStart = false;
    }
    assert(!atStart, "Nothing to parse in parse()");
    return v;
}


/// Builds a message to a buffer similarly to writef/writefln, but without
/// using GC.
///
/// C snprintf would be better, but it isn't pure.
/// formattedWrite isn't completely @nogc yet (although it isn't GC-heavy).
///
/// The user has to ensure buffer is long enough - an assert checks that we don't run
/// out of space. Currently this can only write strings and dchars.
///
/// Params:
///
/// buffer = Buffer to write the message to; it is overwritten from the start.
/// args   = Values to write, in order. Each must be a char[], a string or a dchar.
///
/// Returns: The written part of buffer.
char[] printNoGC(S...)(char[] buffer, S args) @safe pure nothrow @nogc
{
    auto appender = appenderNoGC(buffer);

    foreach(arg; args)
    {
        alias A = typeof(arg);
        static if(is(A == char[]) || is(A == string)) { appender.put(arg); }
        else static if(is(Unqual!A == dchar))         { appender.putDChar(arg); }
        else static assert(false, "printNoGC does not support " ~ A.stringof);
    }

    return appender.data;
}


/// A UFCS utility function to write a dchar to an AppenderNoGCFixed using
/// encodeValidCharNoGC.
///
/// The char $(B must) be a valid dchar.
void putDChar(ref AppenderNoGCFixed!(char[], char) appender, dchar c)
    @safe pure nothrow @nogc
{
    char[4] dcharBuf;
    // ASCII is written directly; encodeValidCharNoGC asserts that it never gets ASCII.
    if(c < 0x80)
    {
        dcharBuf[0] = cast(char)c;
        appender.put(dcharBuf[0 .. 1]);
        return;
    }
    // Should be safe to use as the first thing Reader does is validate everything.
    const bytes = encodeValidCharNoGC(dcharBuf, c);
    appender.put(dcharBuf[0 .. bytes]);
}

/// Convenience function that returns an $(D AppenderNoGCFixed!A) using $(D array)
/// for storage.
AppenderNoGCFixed!(E[]) appenderNoGC(A : E[], E)(A array)
{
    return AppenderNoGCFixed!(E[])(array);
}

/// A gutted, NoGC version of std.array.appender.
///
/// Works on a fixed-size buffer; it never allocates and never grows the buffer.
struct AppenderNoGCFixed(A : T[], T)
{
    private struct Data
    {
        /// Length of the buffer passed to the constructor.
        size_t capacity;
        /// Written part of the buffer; always starts at the start of the buffer.
        Unqual!T[] arr;
    }

    private Data _data;

@nogc:

    /// Construct an appender that will work with given buffer.
    ///
    /// Data written to the appender will overwrite the buffer from the start.
    this(T[] arr) @trusted pure nothrow
    {
        // initialize to a given array.
        _data.arr = cast(Unqual!T[])arr[0 .. 0]; //trusted
        _data.capacity = arr.length;
    }

    /**
     * Returns the capacity of the appender: the length of the buffer it was constructed
     * with, i.e. the maximum total number of elements that can be written to it.
     */
    @property size_t capacity() const @safe pure nothrow
    {
        return _data.capacity;
    }

    /**
     * Returns the managed array (the part of the buffer written so far).
     */
    @property inout(T)[] data() inout @trusted pure nothrow
    {
        /* @trusted operation:
         * casting Unqual!T[] to inout(T)[]
         */
        return cast(typeof(return))(_data.arr);
    }

    // Ensure we can add nelems elements. The buffer is fixed, so running out of space
    // is a fatal error.
    private void ensureAddable(size_t nelems) @safe pure nothrow
    {
        // put() writes through a @trusted pointer slice, so this check is the only
        // thing preventing a write past the end of the buffer. A plain assert is
        // removed in -release builds; assert(false) is not, so use that.
        // capacity >= arr.length always holds, so the subtraction can't underflow
        // (and unlike arr.length + nelems it can't overflow either).
        if(_data.capacity - _data.arr.length < nelems)
        {
            assert(false, "AppenderFixed ran out of space");
        }
    }

    /// Append items to the managed array.
    ///
    /// The buffer must have enough free space for all items.
    void put(U)(U[] items) if (is(Unqual!U == T))
    {
        // make sure we have enough space, then add the items
        ensureAddable(items.length);
        immutable len = _data.arr.length;
        immutable newlen = len + items.length;

        // @trusted: ensureAddable guarantees newlen <= capacity.
        auto bigDataFun() @trusted nothrow { return _data.arr.ptr[0 .. newlen];}
        auto bigData = bigDataFun();

        bigData[len .. newlen] = items[];

        // Only publish the longer slice once the items have been copied.
        _data.arr = bigData;
    }

    // only allow overwriting data on non-immutable and non-const data
    static if (isMutable!T)
    {
        /**
         * Clears the managed array. This allows the elements of the array to be reused
         * for appending.
         *
         * Note that clear is disabled for immutable or const element types, due to the
         * possibility that $(D AppenderNoGCFixed) might overwrite immutable data.
         */
        void clear() @safe pure nothrow
        {
            _data.arr = ()@trusted{ return _data.arr.ptr[0 .. 0]; }();
        }
    }
    else
    {
        /// Clear is not available for const/immutable data.
        @disable void clear();
    }
}
unittest
{
    char[256] buffer;
    auto appender = appenderNoGC(buffer[]);
    appender.put("found unsupported escape character: ");
    appender.putDChar('a');
    appender.putDChar('á');
    assert(appender.data == "found unsupported escape character: aá");
}


/// Result of a validateUTF8NoGC call.
struct ValidateResult
{
    /// Is the validated string valid?
    bool   valid;
    /// Number of characters in the string.
    ///
    /// If the string is not valid, this is the number of valid characters before
    /// hitting the first invalid sequence.
    size_t characterCount;
    /// If the string is not valid, error message with details is here.
    string msg;
}

/// Validate a UTF-8 string, checking if it is well-formed Unicode.
///
/// Params:
///
/// str = String to validate.
///
/// Returns: A ValidateResult with the validity of the string, the number of characters
///          that were successfully decoded and, if invalid, an error message.
///
/// See_Also: ValidateResult
ValidateResult validateUTF8NoGC(const(char[]) str) @trusted pure nothrow @nogc
{
    immutable len = str.length;
    size_t characterCount;
    for (size_t index = 0; index < len; )
    {
        // ASCII must be handled here; decodeUTF8NoGC asserts it never gets ASCII.
        if(str[index] < 0x80)
        {
            ++index;
            ++characterCount;
            continue;
        }

        // Advances index past the decoded code point on success.
        auto decoded = decodeUTF8NoGC!(No.validated)(str, index);
        if(decoded.errorMessage !is null)
        {
            return ValidateResult(false, characterCount, decoded.errorMessage);
        }
        ++characterCount;
    }

    return ValidateResult(true, characterCount);
}

/// @nogc version of std.utf.decode() for char[].
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// validated = If true, assume str is a valid UTF-8 string and don't generate any
///             error-checking code. If validated is true, str $(B must) be a valid
///             character, otherwise undefined behavior will occur. Also affects the
///             return type.
/// str       = Will decode the first code point from this string.
/// index     = Index in str where the code point starts. Must be less than str.length.
///             Will be updated to point to the next code point. If an error is
///             returned, index is left unchanged.
///
/// Returns: If validated is true, the decoded character.
///          Otherwise a struct with a 'decoded' member - the decoded character, and a
///          'string errorMessage' member that is null on success and otherwise stores
///          the error message.
auto decodeUTF8NoGC(Flag!"validated" validated)(const(char[]) str, ref size_t index)
    @trusted pure nothrow @nogc
{
    static if(!validated) struct Result
    {
        dchar decoded;
        string errorMessage;
    }
    else alias Result = dchar;

    /// Dchar bitmask for different numbers of UTF-8 code units.
    enum bitMask = tuple((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    // The code below reads through a raw pointer, so nothing else would catch this.
    assert(index < str.length, "Decoding index is out of bounds of the string");

    auto pstr = str.ptr + index;

    // Number of code units available from index to the end of str.
    immutable length = str.length - index;
    ubyte fst = pstr[0];

    assert(fst & 0x80);
    enum invalidUTFMsg = "Invalid UTF-8 sequence";
    static if(!validated) { enum invalidUTF = Result(cast(dchar)int.max, invalidUTFMsg); }

    // starter must have at least 2 first bits set
    static if(validated)
    {
        assert((fst & 0b1100_0000) == 0b1100_0000, invalidUTFMsg);
    }
    else if((fst & 0b1100_0000) != 0b1100_0000)
    {
        return invalidUTF;
    }

    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    // fst is shifted left once per code unit; its top bit tells if more units follow.
    fst <<= 1;


    // Unrolled at compile time: i is the index of the continuation code unit.
    foreach (i; TypeTuple!(1, 2, 3))
    {
        static if(validated) { assert(i != length, "Decoding out of bounds"); }
        else if(i == length) { return Result(cast(dchar)int.max, "Decoding out of bounds"); }

        tmp = pstr[i];
        // continuation code units must look like 10xx_xxxx
        static if(validated) { assert((tmp & 0xC0) == 0x80, invalidUTFMsg); }
        else if((tmp & 0xC0) != 0x80) { return invalidUTF; }

        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            static if(validated) { assert((d & ~bitMask[i - 1]) != 0, invalidUTFMsg); }
            else if((d & ~bitMask[i - 1]) == 0) { return invalidUTF; }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                static if(validated) { assert(isValidDchar(d), invalidUTFMsg); }
                else if(!isValidDchar(d)) { return invalidUTF; }
            }

            // 4 bytes can encode values above the Unicode range
            static if (i == 3)
            {
                static if(validated) { assert(d <= dchar.max, invalidUTFMsg); }
                else if(d > dchar.max) { return invalidUTF; }
            }

            // Only advance index once the code point is known to be good, so that
            // every error return leaves index untouched.
            index += i + 1;
            return Result(d);
        }
    }

    // The starter byte announced more than 4 code units.
    static if(validated) { assert(false, invalidUTFMsg); }
    else { return invalidUTF; }
}

/// @nogc version of std.utf.decode() for char[], but assumes str is valid UTF-8.
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// str   = Will decode the first code point from this string. Must be valid UTF-8,
///         otherwise undefined behavior WILL occur.
/// index = Index in str where the code point starts. Will be updated to point to the
///         next code point.
alias decodeValidUTF8NoGC = decodeUTF8NoGC!(Yes.validated);

/// @nogc version of std.utf.encode() for char[].
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
/// validated = If true, assume c is a valid, non-surrogate UTF-32 code point and don't
///             generate any error-checking code. If validated is true, c $(B must) be
///             a valid character, otherwise undefined behavior will occur. Also affects
///             the return type.
/// buf       = Buffer to write the encoded result to.
/// c         = Character to encode.
///
/// Returns: If validated is true, number of bytes the encoded character takes up in buf.
///          Otherwise a struct with a 'bytes' member specifying the number of bytes of
///          the encoded character, and a 'string errorMessage' member that is null
///          if there was no error and otherwise stores the error message.
auto encodeCharNoGC(Flag!"validated" validated)(ref char[4] buf, dchar c)
    @safe pure nothrow @nogc
{
    // With validation the result carries an error message; without it, just the size.
    static if(validated) alias Result = size_t;
    else struct Result
    {
        size_t bytes;
        string errorMessage;
    }

    // UTF-8 continuation byte (10xx_xxxx) holding the lowest 6 bits of the argument.
    static char continuation(uint bits) @safe pure nothrow @nogc
    {
        return cast(char)(0x80 | (bits & 0x3F));
    }

    // Force the caller to optimize ASCII (the 1-byte case)
    assert(c >= 0x80, "Caller should explicitly handle ASCII chars");

    immutable uint code = c;

    // Anything above the Unicode range can't be encoded at all.
    if (code > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        static if(validated)
        {
            assert(false, "Supposedly valid code point is invalid");
        }
        else
        {
            return Result(size_t.max, "Can't encode an invalid code point in UTF-8");
        }
    }

    if (code <= 0x7FF)
    {
        // 2 bytes: 110x_xxxx 10xx_xxxx
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (code >> 6));
        buf[1] = continuation(code);
        return Result(2);
    }
    else if (code <= 0xFFFF)
    {
        // Surrogates live in the 3-byte range and must never be encoded.
        static if(validated)
        {
            assert(code < 0xD800 || code > 0xDFFF,
                   "Supposedly valid code point is a surrogate code point");
        }
        else if(code >= 0xD800 && code <= 0xDFFF)
        {
            return Result(size_t.max, "Can't encode a surrogate code point in UTF-8");
        }

        // 3 bytes: 1110_xxxx 10xx_xxxx 10xx_xxxx
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xE0 | (code >> 12));
        buf[1] = continuation(code >> 6);
        buf[2] = continuation(code);
        return Result(3);
    }
    else
    {
        // 4 bytes: 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (code >> 18));
        buf[1] = continuation(code >> 12);
        buf[2] = continuation(code >> 6);
        buf[3] = continuation(code);
        return Result(4);
    }
}

/// @nogc version of std.utf.encode() for char[], but assumes c is a valid UTF-32 char.
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// buf = Buffer to write the encoded result to.
/// c   = Character to encode. Must be valid UTF-32, otherwise undefined behavior
///       $(D will) occur.
///
/// Returns: Number of bytes the encoded character takes up in buf.
alias encodeValidCharNoGC = encodeCharNoGC!(Yes.validated);

/// @nogc version of std.utf.isValidDchar
///
/// Returns: true if c is at most 0x10FFFF and is not a surrogate (0xD800 to 0xDFFF),
///          false otherwise.
bool isValidDchar(dchar c) @safe pure nothrow @nogc
{
    if (c > 0x10FFFF) { return false; }
    immutable isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    return !isSurrogate;
}