1 2 // Copyright Ferdinand Majerech 2011-2014.3 // Distributed under the Boost Software License, Version 1.0.4 // (See accompanying file LICENSE_1_0.txt or copy at5 // http://www.boost.org/LICENSE_1_0.txt)6 7 moduledyaml.reader;
8 9 10 importcore.stdc.stdlib;
11 importcore.stdc..string;
12 importcore.thread;
13 14 importstd.algorithm;
15 importstd.array;
16 importstd.conv;
17 importstd.exception;
18 importstd.stdio;
19 importstd..string;
20 importstd.system;
21 importstd.typecons;
22 importstd.utf;
23 24 importtinyendian;
25 26 importdyaml.fastcharsearch;
27 importdyaml.encoding;
28 importdyaml.exception;
29 importdyaml.nogcutil;
30 31 32 33 package:
/// Exception thrown when the Reader fails to process its input.
class ReaderException : YAMLException
{
    /// Construct a ReaderException.
    ///
    /// Params: msg  = Description of the error; "Reader error: " is prepended to it.
    ///         file = Source file the exception is thrown from.
    ///         line = Source line the exception is thrown from.
    this(string msg, string file = __FILE__, int line = __LINE__)
        @safe pure nothrow
    {
        enum prefix = "Reader error: ";
        super(prefix ~ msg, file, line);
    }
}

/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
final class Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_ = null;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_ = 0;

        // Index of the current character in the buffer.
        size_t charIndex_ = 0;
        // Number of characters (code points) in buffer_.
        size_t characterCount_ = 0;

        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding.
        size_t upcomingASCII_ = 0;

        // Index to buffer_ where the next character to decode by peek()/slice() starts.
        size_t lastDecodedBufferOffset_ = 0;
        // Offset, relative to charIndex_, of the character at lastDecodedBufferOffset_,
        // in code points, not chars.
        size_t lastDecodedCharOffset_ = 0;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations)
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer) @trusted pure //!nothrow
        {
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("Error when converting to UTF-8: " ~ msg);
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed"));

            this.sliceBuilder = SliceBuilder(this);
            checkASCII();
        }

    pure nothrow @nogc:
        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe
        {
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence.
            if(index == lastDecodedCharOffset_)
            {
                ++lastDecodedCharOffset_;
                const char b = buffer_[lastDecodedBufferOffset_];
                // ASCII
                if(b < 0x80)
                {
                    ++lastDecodedBufferOffset_;
                    return b;
                }
                return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
            }

            // 'Slow' path where we decode everything up to the requested character.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        dchar peek() @safe
        {
            if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        char peekByte(const size_t index) @safe
        {
            return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe
        {
            return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params: length = Number of characters (code points, not bytes) to get. May
        ///                  reach past the end of the buffer; in that case the returned
        ///                  slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe
        {
            return slice(length);
        }

        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params: length = Number bytes (not code points) to get. May NOT reach past
        ///                  the end of the buffer; should be used with peek() to avoid
        ///                  this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe
        {
            // A prefix that ends exactly at the end of the buffer is still in bounds,
            // so the comparison must be inclusive.
            assert(length == 0 || bufferOffset_ + length <= buffer_.length,
                   "prefixBytes out of bounds");
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which get invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Does not throw (this method is nothrow): peek() yields '\0' at the end of
        /// the buffer, and moving past the end is only caught by the asserts in
        /// forward().
        ///
        /// Returns: Next character.
        dchar get() @safe
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number or characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Updates the line/column counters for every line break passed.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe
        {
            mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

            while(length > 0)
            {
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_     += asciiToTake;
                length         -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const c = buffer_[bufferOffset_++];
                    // c is ASCII, so we only need to check for ASCII line breaks.
                    // A '\r' directly followed by '\n' is not counted here; the
                    // '\n' is counted instead.
                    if(c == '\n' || (c == '\r' && !followedByLF()))
                    {
                        ++line_;
                        column_ = 0;
                        continue;
                    }
                    ++column_;
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                const c = decodeValidUTF8NoGC(buffer_, bufferOffset_);

                // New line. (can compare with '\n' without decoding since it's ASCII)
                if(search.canFind(c) || (c == '\r' && !followedByLF()))
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }
                --length;
                checkASCII();
            }

            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        void forward() @trusted
        {
            mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

            ++charIndex_;

            // ASCII
            if(upcomingASCII_ > 0)
            {
                --upcomingASCII_;
                const c = buffer_[bufferOffset_++];

                if(c == '\n' || (c == '\r' && !followedByLF()))
                {
                    ++line_;
                    column_ = 0;
                }
                else { ++column_; }
            }
            // UTF-8
            else
            {
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                const c = decodeValidUTF8NoGC(buffer_, bufferOffset_);

                // New line. (can compare with '\n' without decoding since it's ASCII)
                if(search.canFind(c) || (c == '\r' && !followedByLF()))
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }

                checkASCII();
            }

            // Reset the decode cache only after bufferOffset_ has moved, exactly as
            // forward(size_t) does. Resetting it before the move would leave it
            // pointing at the character just consumed, so a following peek(0) or
            // slice(0) would work from the wrong position.
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Used to build slices of read data in Reader; to avoid allocations.
        SliceBuilder sliceBuilder;

    @safe pure nothrow @nogc:
        /// Get a string describing current buffer position, used for error messages.
        Mark mark() const { return Mark(line_, column_); }

        /// Get current line number.
        uint line() const { return line_; }

        /// Get current column number.
        uint column() const { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const { return encoding_; }

    private:
        // Update upcomingASCII_ (should be called after forward()ing over a UTF-8 sequence)
        void checkASCII()
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Is the byte at the current buffer position a '\n'?
        //
        // Used right after consuming a '\r' to tell a lone '\r' (a line break on its
        // own) from the first half of "\r\n". Returns false at the end of the buffer
        // instead of indexing past it.
        bool followedByLF() const
        {
            return bufferOffset_ < buffer_.length && buffer_[bufferOffset_] == '\n';
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext()
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
        }
}

/// Builds slices of data the Reader has already read, directly inside the Reader's
/// buffer, so no allocation is needed.
///
/// Most of the time the resulting slice refers to untouched Reader data, but the
/// data may be rewritten in place where YAML gives characters/strings a different
/// meaning.
///
/// See begin() documentation.
struct SliceBuilder
{
pure nothrow @nogc:
private:
    // The user must not copy a SliceBuilder.
    @disable this(this);
    @disable void opAssign(ref SliceBuilder);

    // The Reader whose buffer the slice is built in.
    Reader reader_;

    // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built)
    size_t start_ = size_t.max;
    // End of the slice in reader_.buffer_ (size_t.max while no slice is being built)
    size_t end_ = size_t.max;

    // Saved slice ends that transactions can revert to (see Transaction)
    //
    // Kept small on purpose: transactions are not meant to nest arbitrarily deep.
    size_t[4] endStack_;
    // How many entries of endStack_ are in use.
    size_t endStackUsed_ = 0;

    @safe const invariant()
    {
        if(inProgress)
        {
            assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
            assert(start_ <= end_, "Slice start after slice end");
        }
    }

    // Is a slice currently being built?
    bool inProgress() @safe const
    {
        const hasStart = start_ != size_t.max;
        const hasEnd   = end_ != size_t.max;
        // start_ and end_ must be set or unset together.
        assert(hasStart == hasEnd, "start_/end_ are not consistent");
        return hasStart;
    }

public:
    /// Begin building a slice.
    ///
    /// At most one slice may be under construction; any previous slice must be
    /// finish()-ed first.
    ///
    /// The new slice is empty and starts at the Reader's current buffer position.
    /// It may only grow up to the current position, which is moved by the Reader
    /// methods get() and forward(). So write()-ing a string just returned by get()
    /// is valid, while one returned by prefix() is valid only once the position has
    /// moved past it.
    void begin() @system
    {
        assert(!inProgress, "Beginning a slice while another slice is being built");
        assert(endStackUsed_ == 0, "Slice stack not empty at slice begin");

        start_ = end_ = reader_.bufferOffset_;
    }

    /// Finish building a slice and return it.
    ///
    /// All Transactions on the slice must have been committed or destroyed by now.
    ///
    /// Returns: The finished slice of the Reader buffer; the builder will not change
    ///          its contents any more.
    char[] finish() @system
    {
        assert(inProgress, "finish called without begin");
        assert(endStackUsed_ == 0, "Finishing a slice with running transactions.");

        auto built = reader_.buffer_[start_ .. end_];
        // Mark the builder as idle again.
        start_ = size_t.max;
        end_   = size_t.max;
        return built;
    }

    /// Write a string to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// When str already sits in the buffer directly after the slice (as strings
    /// returned by Reader methods do), the slice is simply extended over it.
    ///
    /// See_Also: begin
    void write(char[] str) @system
    {
        assert(inProgress, "write called without begin");
        assert(end_ <= reader_.bufferOffset_,
               "AT START: Slice ends after buffer position");

        char* sliceEnd = reader_.buffer_.ptr + end_;
        // Copy only if str is not already in place. str may still overlap the
        // buffer (it can come from a Reader method), hence memmove, not memcpy.
        if(str.ptr != sliceEnd)
        {
            core.stdc.string.memmove(sliceEnd, cast(char*)str.ptr,
                                     str.length * char.sizeof);
        }
        end_ += str.length;
    }

    /// Write a character to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// See_Also: begin
    void write(dchar c) @system
    {
        assert(inProgress, "write called without begin");
        if(c >= 0x80)
        {
            // Non-ASCII: encode into UTF-8 first, then copy the code units.
            char[4] encoded;
            const encodedLength = encodeValidCharNoGC(encoded, c);
            reader_.buffer_[end_ .. end_ + encodedLength] = encoded[0 .. encodedLength];
            end_ += encodedLength;
        }
        else
        {
            reader_.buffer_[end_] = cast(char)c;
            ++end_;
        }
    }

    /// Insert a character to a specified position in the slice.
    ///
    /// The slice grows by the UTF-8 length of the character. Note that the slice can
    /// only extend up to the current position in the Reader buffer.
    ///
    /// Params:
    ///
    /// c        = The character to insert.
    /// position = Position to insert the character at in code units, not code points.
    ///            Must not be greater than slice length(); a previously returned
    ///            length() can be used.
    void insert(const dchar c, const size_t position) @system
    {
        assert(inProgress, "insert called without begin");
        assert(start_ + position <= end_, "Trying to insert after the end of the slice");

        const insertAt   = start_ + position;
        const tailLength = end_ - insertAt;

        // UTF-8 form of c
        char[4] encoded;
        size_t encodedLength = 1;
        if(c < 0x80) { encoded[0] = cast(char)c; }
        else         { encodedLength = encodeValidCharNoGC(encoded, c); }

        // Shift everything after the insertion point to make room.
        if(tailLength > 0)
        {
            char* tail = reader_.buffer_.ptr + insertAt;
            core.stdc.string.memmove(tail + encodedLength, tail,
                                     tailLength * char.sizeof);
        }
        reader_.buffer_[insertAt .. insertAt + encodedLength] = encoded[0 .. encodedLength];
        end_ += encodedLength;
    }

    /// Get the current length of the slice, in code units.
    size_t length() @safe const
    {
        return end_ - start_;
    }

    /// A slice building transaction.
    ///
    /// Saves the end of the slice so it can later be restored.
    struct Transaction
    {
    @system pure nothrow @nogc:
    private:
        // The slice builder this transaction operates on.
        SliceBuilder* builder_ = null;
        // Index of this transaction's return point in SliceBuilder.endStack_.
        size_t stackLevel_;
        // Set once commit() has run.
        bool committed_;

    public:
        /// Begins a transaction on a SliceBuilder object.
        ///
        /// A transaction must end $(B after) all transactions started inside it and
        /// $(B before) the slice is finish()-ed. It ends either through commit() or,
        /// reverting the slice, through the destructor.
        ///
        /// Saves the current state of a slice.
        this(ref SliceBuilder builder)
        {
            builder_    = &builder;
            stackLevel_ = builder.endStackUsed_;
            builder.push();
        }

        /// Commit changes to the slice.
        ///
        /// Ends the transaction; may be called only once, after which the slice can
        /// no longer be reverted by this transaction.
        ///
        /// Does nothing for a default-initialized transaction (the transaction has not
        /// been started yet).
        void commit()
        {
            assert(!committed_, "Can't commit a transaction more than once");

            if(builder_ !is null)
            {
                assert(builder_.endStackUsed_ == stackLevel_ + 1,
                       "Parent transactions don't fully contain child transactions");
                builder_.apply();
                committed_ = true;
            }
        }

        /// Destroy the transaction and revert it if it hasn't been committed yet.
        ///
        /// Does nothing for a default-initialized transaction.
        ~this()
        {
            if(builder_ !is null && !committed_)
            {
                assert(builder_.endStackUsed_ == stackLevel_ + 1,
                       "Parent transactions don't fully contain child transactions");
                builder_.pop();
                builder_ = null;
            }
        }
    }

private:
    // Remember the current slice end so it can be restored later.
    //
    // Used by Transaction.
    void push() @system
    {
        assert(inProgress, "push called without begin");
        assert(endStackUsed_ < endStack_.length, "Slice stack overflow");
        endStack_[endStackUsed_] = end_;
        ++endStackUsed_;
    }

    // Restore the most recently pushed slice end, discarding everything written to
    // the slice since it was pushed.
    //
    // Used by Transaction.
    void pop() @system
    {
        assert(inProgress, "pop called without begin");
        assert(endStackUsed_ > 0, "Trying to pop an empty slice stack");
        --endStackUsed_;
        end_ = endStack_[endStackUsed_];
    }

    // Drop the most recently pushed slice end but keep the current one, making the
    // changes written since the push permanent.
    //
    // Used by Transaction.
    void apply() @system
    {
        assert(inProgress, "apply called without begin");
        assert(endStackUsed_ > 0, "Trying to apply an empty slice stack");
        endStackUsed_ -= 1;
    }
}
713 714 715 private:

// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is null. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            // Number of UTF-8 bytes written so far. This is a size_t (not an int)
            // so it cannot overflow for very large buffers. It never overtakes the
            // read position because each 4-byte dchar yields at most 4 bytes.
            size_t length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                const encodeResult = encodeCharNoGC!(No.validated)(encodeBuf, c);
                if(encodeResult.errorMessage !is null)
                {
                    result.errorMessage = encodeResult.errorMessage;
                    return;
                }
                const bytes = encodeResult.bytes;
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            // Already UTF-8: reinterpret the bytes and validate them.
            result.utf8 = cast(char[])input;
            const validateResult = result.utf8.validateUTF8NoGC();
            if(!validateResult.valid)
            {
                result.errorMessage = "UTF-8 validation error after character #" ~
                                      validateResult.characterCount.to!string ~ ": " ~
                                      validateResult.msg;
            }
            result.characterCount = validateResult.characterCount;
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    // Conversion failures are reported through errorMessage, not by throwing,
    // as this function is nothrow.
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}

/// Determine if all characters (code points, not bytes) in a string are printable.
///
/// Params:  chars = UTF-8 string to check. It is decoded with decodeValidUTF8NoGC,
///                  so it is expected to be valid UTF-8 already.
///
/// Returns: true if every code point is tab, line feed, carriage return, an ASCII
///          character from 0x20 up, U+0085, or in one of the ranges U+00A0-U+D7FF,
///          U+E000-U+FFFD, U+10000-U+10FFFF; false otherwise.
bool isPrintableValidUTF8(const char[] chars) @trusted pure nothrow @nogc
{
    // This is oversized (only 128 entries are necessary) simply because having 256
    // entries improves performance... for some reason (alignment?)
    //
    // NOTE(review): the table marks 0x7F (DEL) as printable, although YAML's
    // c-printable production ends the ASCII range at 0x7E - confirm whether this
    // is intentional.
    bool[256] printable = [false, false, false, false, false, false, false, false,
                           false, true,  true,  false, false, true,  false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,

                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false];

    for(size_t index = 0; index < chars.length;)
    {
        // Fast path for ASCII.
        // Both this while() block and the if() block below it are optimized, unrolled
        // versions of the for() block below them; the while()/if() block could be
        // removed without affecting logic, but both help increase performance.
        size_t asciiCount = countASCII(chars[index .. $]);
        // 8 ASCII iterations unrolled, looping while there are more than 8 ASCII chars.
        while(asciiCount > 8)
        {
            const dchar b0 = chars[index];
            const dchar b1 = chars[index + 1];
            const dchar b2 = chars[index + 2];
            const dchar b3 = chars[index + 3];
            const dchar b4 = chars[index + 4];
            const dchar b5 = chars[index + 5];
            const dchar b6 = chars[index + 6];
            const dchar b7 = chars[index + 7];

            index += 8;
            asciiCount -= 8;

            // All eight bytes must be printable. The last operand must be b7;
            // testing b1 twice would let a non-printable eighth byte through.
            const all = printable[b0] & printable[b1] & printable[b2] & printable[b3] &
                        printable[b4] & printable[b5] & printable[b6] & printable[b7];
            if(!all)
            {
                return false;
            }
        }
        // 4 ASCII iterations unrolled
        if(asciiCount > 4)
        {
            const char b0 = chars[index];
            const char b1 = chars[index + 1];
            const char b2 = chars[index + 2];
            const char b3 = chars[index + 3];

            index += 4;
            asciiCount -= 4;

            if(!printable[b0]) { return false; }
            if(!printable[b1]) { return false; }
            if(!printable[b2]) { return false; }
            if(!printable[b3]) { return false; }
        }
        // Any remaining ASCII chars. This is really the only code needed to handle
        // ASCII, the above if() and while() blocks are just an optimization.
        for(; asciiCount > 0; --asciiCount)
        {
            const char b = chars[index];
            ++index;
            if(b >= 0x20)     { continue; }
            if(printable[b])  { continue; }
            return false;
        }

        if(index == chars.length) { break; }

        // Not ASCII, need to decode.
        const dchar c = decodeValidUTF8NoGC(chars, index);
        // We know c is not ASCII, so only check for printable non-ASCII chars.
        // The last range covers the supplementary planes, which YAML's c-printable
        // production also allows.
        if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
             (c >= '\uE000' && c <= '\uFFFD') ||
             (c >= '\U00010000' && c <= '\U0010FFFF')))
        {
            return false;
        }
    }
    return true;
}

/// Count the ASCII characters at the start of buffer, stopping at the first byte
/// that belongs to a multi-byte UTF-8 sequence.
///
/// Tells the caller how many characters can be processed without UTF-8 decoding.
size_t countASCII(const(char)[] buffer) @trusted pure nothrow @nogc
{
    // A chunk is all-ASCII exactly when none of its bytes has the topmost bit set.
    enum ulong  HighBits8 = 0x8080808080808080;
    enum uint   HighBits4 = 0x80808080;
    enum ushort HighBits2 = 0x8080;

    size_t asciiChars = 0;

    // Consume whole 8-byte chunks for as long as they are pure ASCII.
    for(; buffer.length >= HighBits8.sizeof; buffer = buffer[HighBits8.sizeof .. $])
    {
        const chunk = *cast(typeof(HighBits8)*)buffer.ptr;
        if((chunk & HighBits8) != 0) { break; }
        asciiChars += HighBits8.sizeof;
    }

    // Then try one 4-byte chunk followed by one 2-byte chunk.
    import std.typetuple;
    foreach(HighBits; TypeTuple!(HighBits4, HighBits2))
    {
        if(buffer.length >= HighBits.sizeof)
        {
            const chunk = *cast(typeof(HighBits)*)buffer.ptr;
            if((chunk & HighBits) == 0)
            {
                asciiChars += HighBits.sizeof;
                buffer = buffer[HighBits.sizeof .. $];
            }
        }
    }

    // At most one more ASCII byte can be left before the first non-ASCII byte.
    if(buffer.length > 0 && buffer[0] < 0x80) { ++asciiChars; }

    return asciiChars;
}
// Unittests.

// Check that a reader of type R reports the encoding and byte order of UTF-16 input
// given in both byte orders.
void testEndian(R)()
{
    writeln(typeid(R).toString() ~ ": endian unittest");

    // Build a reader over input and compare what it detected with the expectation.
    void checkDetection(ubyte[] input, Encoding expectedEncoding, Endian expectedEndian)
    {
        auto reader = new R(input);
        assert(reader.encoding == expectedEncoding);
        assert(reader.endian_ == expectedEndian);
    }

    // Byte order mark followed by 'z' in each byte order.
    ubyte[] utf16LittleEndian = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BigEndian    = [0xFE, 0xFF, 0x00, 0x7A];
    checkDetection(utf16LittleEndian, Encoding.UTF_16, Endian.littleEndian);
    checkDetection(utf16BigEndian,    Encoding.UTF_16, Endian.bigEndian);
}

// Check peek(), prefix() and forward() of a reader of type R on a short UTF-8 input
// that starts with a byte order mark.
void testPeekPrefixForward(R)()
{
    import dyaml.stream;
    writeln(typeid(R).toString() ~ ": peek/prefix/forward unittest");

    ubyte[] input = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto reader = new R(input);

    // The byte order mark must not show up among the characters.
    assert(reader.peek()  == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    // Peeking past the end yields '\0'.
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");

    // After skipping "da", the character one past the current one is the last 'a'.
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}

// Check that a reader of type R reads the same text from UTF-8, UTF-16 and UTF-32
// input (the latter two in the byte order of the machine running the test).
void testUTF(R)()
{
    import dyaml.stream;
    writeln(typeid(R).toString() ~ ": UTF formats unittest");
    dchar[] text = cast(dchar[])"data";

    // Prefix the raw bytes of encoded with the given byte order mark and check that
    // the reader yields "data".
    void checkEncoding(T)(T[] encoded, BOM bom)
    {
        ubyte[] input = ByteOrderMarks[bom] ~
                        (cast(ubyte[])encoded)[0 .. encoded.length * T.sizeof];
        auto reader = new R(input);
        assert(reader.peek()  == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    const bigEndian = endian == Endian.bigEndian;
    checkEncoding!char(to!(char[])(text), BOM.UTF8);
    checkEncoding!wchar(to!(wchar[])(text), bigEndian ? BOM.UTF16BE : BOM.UTF16LE);
    checkEncoding(text, bigEndian ? BOM.UTF32BE : BOM.UTF32LE);
}

// Check that a reader of type R handles an input consisting of a single byte.
void test1Byte(R)()
{
    writeln(typeid(R).toString() ~ ": 1 byte file unittest");

    // A lone 'a'.
    ubyte[] input = [97];
    auto reader = new R(input);

    assert(reader.peek()  == 'a');
    // Anything after the only character reads as '\0'.
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}

// Run every Reader test defined above against the Reader class.
unittest
{
    alias R = Reader;
    testEndian!R();
    testPeekPrefixForward!R();
    testUTF!R();
    test1Byte!R();
}