1 /**
2  * iz string handling functions, mostly related to lexical scanning
3  */
4 module iz.strings;
5 
6 import
7     std.range, std.traits, std.algorithm.searching;
8 import
9     iz.sugar, iz.types;
10 
11 version(unittest) import std.stdio;
12 
13 // Character-related-structs --------------------------------------------------+
14 
15 /**
16  * CharRange is an helper struct that allows to test
17  * if a char is within a full range of characters.
18  */
19 struct CharRange
20 {
21     import std.conv: to;
22     private immutable dchar _min, _max;
23     
24     /**
25      * Constructs the char range using a string that contains the range bounds.
26      *
27      * Params:
28      *      s = A string. It neither has to be sorted nor to contain the full range.
29      */
30     this(S)(S s) pure @safe
31     if (isSomeString!S)
32     {
33         import std.algorithm.sorting: sort;
34         auto sorted = sort(to!(dchar[])(s));
35         _min = sorted[0];
36         _max = sorted[$-1];
37     }
38     
39     /**
40      * Constructs the char range using the two chars passed as argument.
41      *
42      * Params:
43      *      cmin: The lower character in the range.
44      *      cmax: The upper (inclusive) character in the range.
45      */
46     this(C)(C cmin, C cmax) pure @safe
47     if (isSomeChar!C || isImplicitlyConvertible!(C, dchar))
48     {
49         auto const maybeMin = to!dchar(cmin);
50         auto const maybeMax = to!dchar(cmax);
51         if (maybeMin <= maybeMax)
52         {
53             _min = maybeMin;
54             _max = maybeMax;
55         }
56         else
57         {
58             _min = maybeMax;
59             _max = maybeMin;
60         }     
61     }
62     
63     /// Returns the lower char in the range.
64     dchar min() pure nothrow @safe @nogc
65     {return _min;}
66     
67     /// Returns the upper char in the range.
68     dchar max() pure nothrow @safe @nogc
69     {return _max;}
70 
71     /**
72      * Returns true if a character is within the range.
73      *
74      * Params:
75      *      c = A character or any value convertible to a dchar.
76      */
77     bool opBinaryRight(string op = "in", C)(C c) const pure nothrow @safe @nogc
78     if (op == "in")
79     {
80         static if (isSomeChar!C || isImplicitlyConvertible!(C, dchar))
81         {
82             return ((c >= _min) & (c <= _max)); 
83         }
84         else static assert(0, "invalid argument type for CharRange.opIn_r(): " ~ C.stringof);
85     }
86     
87     /**
88      * Returns the range representation, as a string.
89      * This function fails if the range is not within the 0x0 .. 0x80 range.
90      */
91     string toString() const pure @safe
92     {
93         auto r = iota(_min, _max+1);
94         string result;
95         while (!r.empty)
96         {
97             result ~= to!char(r.front);
98             r.popFront;
99         }
100         return result;
101     }
102 }
103 ///
104 pure @safe unittest
105 {
106     auto cs1 = CharRange("ajslkdfjlz");
107     assert(cs1.min == 'a');
108     assert(cs1.max == 'z');
109     assert('b' in cs1);
110     
111     auto cs2 = CharRange('f', 'a');
112     assert(cs2.min == 'a');
113     assert(cs2.max == 'f');
114     assert('b' in cs2);
115     assert('g' !in cs2);
116     assert(cs2.toString == "abcdef", cs2.toString);
117     
118     auto cs3 = CharRange(65, 70);
119     assert(cs3.min == 65);
120     assert(cs3.max == 70);
121     assert(66 in cs3);
122     assert(71 !in cs3);
123 }
124 
125 /// a CharSwitch that verify characters for decimal numbers.
126 static immutable CharSwitch!("[0..9]") decimalChars;
127 /// a CharSwitch that verify characters for octal numbers.
128 static immutable CharSwitch!("[0..7]") octalChars;
129 
130 /**
131  * CharMap is an helper struct that allows to test
132  * if a char is within a set of characters.
133  */
134 struct CharMap
135 {
136     private bool[] _map;
137     private dchar _min, _max;
138     
139     private void setMinMax(dchar value) pure nothrow @safe
140     {
141         if (value <= _min) _min = value;
142         else if (value >= _max) _max = value;
143         _map.length = _max + 1 - _min; 
144     }
145 
146     /**
147      * Used in the construction process.
148      *
149      * Params:
150      *      lo = The dchar that defines the range lower bound.
151      *      hi = The dchar that defines the range upper bound (inclusive).
152      *
153      * Examples:
154      * ---
155      * CharMap cm = CharMap['0'..'9'];
156      * ---
157      */
158     static CharRange opSlice(int index)(dchar lo, dchar hi) pure nothrow @safe @nogc
159     {
160         return CharRange(lo, hi);
161     }
162     
163     /**
164      * Used in the construction process.
165      *
166      * Params:
167      *      a = A list made of character slices, of single characters or
168      *
169      * any other values whose type is implicitly convertible to dchar.
170      *
171      * Examples:
172      * ---
173      * CharMap cm = CharMap['0'..'9', '.', 'f', 'd', 38, 39];
174      * ---
175      */
176     static CharMap opIndex(A...)(A a) pure nothrow @safe
177     {   
178         CharMap result;
179         
180         // bounds
181         foreach(elem; a)
182         {
183             alias T = typeof(elem);
184             static if (isSomeChar!T || isImplicitlyConvertible!(T, dchar))
185             {
186                 result.setMinMax(elem);      
187             }
188             else static if (is(T == CharRange))
189             {
190                 result.setMinMax(elem._min);
191                 result.setMinMax(elem._max);    
192             }
193             else static assert(0, "unsupported opIndex argument type: " ~ T.stringof);
194         }
195         
196         result._map[] = false;   
197         foreach(elem; a)
198         {    
199             alias T = typeof(elem);
200             static if (isSomeChar!T || isImplicitlyConvertible!(T, dchar))
201                 result._map[elem - result._min] = true;   
202             else static if (is(T == CharRange))
203             {
204                 foreach(size_t i; elem._min - result._min .. elem._max - result._min + 1)
205                     result._map[i] = true;
206             }
207         }
208         return result;
209     }
210     
211     /**
212      * Returns true if a character is within the map.
213      *
214      * Params:
215      *      c = A character or any value convertible to a dchar.
216      */
217     bool opBinaryRight(string op = "in", C)(C c) const pure nothrow @safe @nogc
218     if (op == "in")
219     {
220         static if (isSomeChar!C || isImplicitlyConvertible!(C, dchar))
221         {
222             if (_min > c || c > _max) return false;
223             else return _map[c - _min]; 
224         }
225         else static assert(0, "invalid argument type for CharMap.opIn_r(): " ~ C.stringof);
226     }
227 }
228 ///
229 pure @safe unittest
230 {
231     CharMap cm = CharMap['a'..'f', '0'..'9' , 'A'..'F', '_', 9];
232     assert('a' in cm);
233     assert('b' in cm);
234     assert('c' in cm);
235     assert('d' in cm);
236     assert('e' in cm);
237     assert('f' in cm);
238     assert('g' !in cm);
239     assert('A' in cm);
240     assert('B' in cm);
241     assert('C' in cm);
242     assert('D' in cm);
243     assert('E' in cm);
244     assert('F' in cm);
245     assert('G' !in cm);
246     assert('0' in cm);
247     assert('4' in cm);
248     assert('9' in cm);
249     assert('_' in cm);
250     assert('%' !in cm);
251     assert('\t' in cm);
252 }
253 
254 /// A CharSwitch that includes the hexadecimal characters.
255 static immutable CharSwitch!("[a..f]", "[A..F]", "[0..9]") hexChars;
256 /// A CharSwitch that includes the white characters.
257 static immutable CharSwitch!("[\t..\r]", ' ') whiteChars;
258 
259 
260 /**
261  * The CharSwitch structure allows to define static sets of character.
262  *
263  * Unlike CharMap it's more adapted to sets made of unicode characters
264  * because the characters are tested with a switch table that's generated at
265  * compile-time rather than reading, with an indirection, a sparse array of booleans.
266  *
267  * The structure uniquely implements the $(D in) operator.
268  *
269  * Params:
270  *      A = Variadic parameters made of literal characters, integer numbers or
271  *      character ranges literals, such as $(D "[0..9]") or $(D "[a..z]").
272  */
273 struct CharSwitch(A...)
274 if (A.length)
275 {
276 
277     import std.conv: to;
278     import std.range: iota;
279     import std.meta: aliasSeqOf;
280 
281     private alias case_ = (uint v) => "\tcase " ~ to!string(v) ~ ": return true;\n";
282 
283     // CTFE: generates cases for single chars: 'a', 'é', 57, ...
284     private static string caseForChar(V)(V value)
285     if (isSomeChar!V || isIntegral!V)
286     {
287         return case_(value);
288     }
289 
290     // CTFE: generates cases for char ranges: "[a..g]", '[0..9]', ...
291     private static string casesForRange(string value) @safe pure
292     {
293         enum err = "invalid character range format, the format must respect [<lo>..<hi>]";
294         assert(!value.empty, err);
295         assert(value.front == '[', err);
296         value.popFront;
297 
298         size_t i = 1;
299         dchar min = value.front;
300         if (min == '\\')
301         {
302             while (true)
303             {
304                 if (value.empty || value.front == '.')
305                     break;
306 
307                 value.popFront;
308                 ++i;
309             }
310             min = value[1..i].to!dchar;
311         }
312 
313         assert(!value.empty, err);
314         value.popFront;
315         assert(!value.empty, err);
316         assert(value.front == '.', err);
317         value.popFront;
318         assert(!value.empty, err);
319         assert(value.front == '.', err);
320         value.popFront;
321 
322         dchar max = value.front;
323         if (max == '\\')
324         {
325             ++++i;
326             const lo = i;
327             while (true)
328             {
329                 if (value.empty || value.front == '.')
330                     break;
331 
332                 value.popFront;
333                 ++i;
334             }
335             max = value[lo..i].to!dchar;
336         }
337 
338         assert(min < max, "invalid character range: " ~ min.to!string ~ " > " ~ max.to!string);
339         assert(!value.empty, err);
340         value.popFront;
341         assert(value.front == ']', err);
342         assert(!value.empty, err);
343         value.popFront;
344         assert(value.empty, err);
345 
346         string result;
347         foreach(c; iota(min, max + 1))
348         {
349             result ~= caseForChar(c);
350         }
351         return result;
352     }
353 
354     // CTFE: makes the switch
355     private static string makeOpIn(A...)()
356     {
357         string result = "static bool opIn_r(dchar value) @safe @nogc pure nothrow"
358             ~ "\n{\n\tswitch(value)\n\t{\n\tdefault: return false;\n ";
359 
360         foreach(i; aliasSeqOf!(iota(0, A.length)))
361         {
362             alias T = typeof(A[i]);
363             static if (isSomeChar!T)
364             {
365                 result ~= caseForChar(A[i]);
366             }
367             else static if (isIntegral!T)
368             {
369                 static if (dchar.min <= A[i] && A[i] <= dchar.max)
370                     result ~= caseForChar(A[i]);
371                 else
372                     static assert(0, "integral value exceeds dchar.max");
373             }
374             else static if (is(T == string))
375             {
376                 result ~= casesForRange(A[i]);
377             }
378             else
379                 static assert(0, "invalid argument type: " ~ T.stringof);
380         }
381 
382         result ~= "\t}\n};";
383 
384         return result;
385     }
386 
387     mixin(makeOpIn!A);
388 }
389 ///
390 @nogc @safe pure unittest
391 {
392     // syntax example for a character range
393     alias digits = static immutable CharSwitch!("[0..9]") ;
394     assert('0' in digits);
395     assert('e' !in digits);
396 
397     // many ranges are accepted
398     alias hexdgs = static immutable CharSwitch!("[0..9]", "[A..F]", "[a..f]");
399     assert('0' in hexdgs);
400     assert('e' in hexdgs);
401     assert('f' in hexdgs);
402     assert('G' !in hexdgs);
403 
404     // ranges and chars can be mixed
405     alias frenchchars = static immutable CharSwitch!('à','é','è','ç',"[a..z]","[A..Z]",'ù');
406     assert('ß' !in frenchchars);
407     assert('é' in frenchchars);
408 }
409 
410 
411 /**
412  * Returns an input range that processes directly a null terminated C string,
413  * without fully converting it to a phobos string.
414  *
415  * Params:
416  *      decode = When set to true the front is decoded otherwise (the default)
417  *          each code point is supposed to contain 1 unit.
418  *      c = A pointer to a character.
419  * Returns:
420  *      When decoding is enabled, nullTerminated always returns a range of dchar
421  *      otherwise the front type is the same as target type of the pointer passed
422  *      as parameter.
423  */
424 auto nullTerminated(bool decode = false, C)(C c)
425 if (isPointer!C && isSomeChar!(PointerTarget!(C)))
426 {
427     // trusting:
428     // - read only = no corruption
429     // - at the end we always know what happens:
430     //      - null pointer encountered if valid string ptr passed.
431     //      - null pointer encountered after a while and if invalid ptr passed, but still not corruption.
432     //      - otherwise segfault, when the range goes over the process memory.
433     struct NullTerminated(C)
434     {
435         private C _front;
436         enum dec = !is(C == dchar*) && decode;
437         static if (dec) private size_t _cnt;
438 
439         private this(C c) @trusted @nogc
440         {
441             _front = c;
442         }
443         ///
444         bool empty() @trusted @nogc
445         {
446             return !_front || *_front == 0;
447         }
448         ///
449         auto front() @trusted
450         {
451             import std.utf: decodeFront;
452             static if (dec)
453             {
454                 auto frt = _front[0 .. 4 / (PointerTarget!C).sizeof];
455                 return decodeFront(frt, _cnt);
456             }
457             else return *_front;
458         }
459         ///
460         void popFront() @trusted @nogc
461         {
462             static if (dec)
463                 _front += _cnt;
464             else
465                 ++_front;
466         }
467         ///
468         C save() @trusted @nogc
469         {
470             return _front;
471         }
472     }
473     return NullTerminated!C(c);
474 }
475 ///
476 pure @safe unittest
477 {
478     auto text = "ab cd\0";
479     auto cString = nullTerminated(&text[0]);
480     assert(nextWord(cString) == "ab");
481     auto saved = nullTerminated(cString.save);
482     assert(nextWord(cString) == "cd");
483     assert(nextWord(saved) == "cd");
484     assert(cString.empty);
485     auto wtext = "ab cd\0"w;
486     auto cWideString = nullTerminated(&wtext[0]);
487     assert(nextWord(cWideString) == "ab"w);
488     assert(nextWord(cWideString) == "cd"w);
489     assert(cWideString.empty);
490 }
491 
492 pure @safe unittest
493 {
494     auto text = "été\0";
495     auto cString = nullTerminated!true(&text[0]);
496     assert(cString.front == 'é');
497     cString.popFront;
498     assert(cString.front == 't');
499     cString.popFront;
500     assert(cString.front == 'é');
501     cString.popFront;
502     assert(cString.empty);
503 }
504 
505 pure @nogc @safe unittest
506 {
507     char* text;
508     assert(text.nullTerminated.empty);
509 }
510 
511 pure @safe unittest
512 {
513     auto text = "été\0"w;
514     auto cString = nullTerminated!true(&text[0]);
515     assert(cString.front == 'é');
516     cString.popFront;
517     assert(cString.front == 't');
518     cString.popFront;
519     assert(cString.front == 'é');
520     cString.popFront;
521     assert(cString.empty);
522 }
523 
524 pure @safe unittest
525 {
526     auto text = "été\0"d;
527     {
528         auto cString = nullTerminated!true(&text[0]);
529         assert(cString.front == 'é');
530         cString.popFront;
531         assert(cString.front == 't');
532         cString.popFront;
533         assert(cString.front == 'é');
534         cString.popFront;
535         assert(cString.empty);
536     }
537     {
538         auto cString = nullTerminated!false(&text[0]);
539         assert(cString.front == 'é');
540         cString.popFront;
541         assert(cString.front == 't');
542         cString.popFront;
543         assert(cString.front == 'é');
544         cString.popFront;
545         assert(cString.empty);
546     }
547 }
548 
549 
550 // -----------------------------------------------------------------------------
551 // Generic Scanning functions -------------------------------------------------+
552 private template CharType(T)
553 {
554     alias CharType = Unqual!(ElementEncodingType!T);
555 }
556 
557 /**
558  * Tests wether $(D T) is supported in the several scanning functions.
559  *
560  * T must either be a CharRange, a CharMap, supports std.algorithm.searching.canFind,
561  * be a callable of type $(D bool(dchar)) or $(D bool function(dchar)) or be a
562  * single dchar.
563  */
564 template isCharTester(T)
565 {
566     static if (isInputRange!T && isSomeChar!(ElementType!T))
567         enum isCharTester = true;
568     else static if (is(Unqual!T == CharRange))
569         enum isCharTester = true;
570     else static if (is(Unqual!T == CharMap))
571         enum isCharTester = true;
572     else static if (isAssociativeArray!T && isSomeChar!(KeyType!T))
573         enum isCharTester = true;
574     else static if (isSomeFunction!T && is(ReturnType!T == bool) &&
575         Parameters!T.length == 1 && is(Parameters!T[0] == dchar))
576         enum isCharTester = true;
577     else static if (isSomeChar!T)
578         enum isCharTester = true;
579     else
580     {
581         alias B = TemplateOf!(Unqual!T);
582         static if (is(B!'a' == CharSwitch!'a'))
583             enum isCharTester = true;
584         else
585             enum isCharTester = false;
586     }
587 }
588 
589 unittest
590 {
591     alias B = TemplateOf!(Unqual!(typeof(whiteChars)));
592     static assert (is(B!'a' == CharSwitch!'a'));
593 }
594 
595 
596 /**
597  * Returns the next word in the range passed as argument.
598  *
599  * Params: 
600  *      range = A character input range. The range is consumed for each word.
601  *      charTester = Defines the valid characters to make a word.
602  *
603  * Returns:
604  *      A string containing the word. If the result length is null then the
605  *      range parameter has not been consumed.
606  */
607 auto nextWord(Range, T, bool until = false)(ref Range range, T charTester)
608 if (isInputRange!Range && isSomeChar!(ElementType!Range) && isCharTester!T)
609 {
610     alias UT = Unqual!T; 
611     CharType!Range[] result;
612     dchar current = void;
613 
614     static if (hasInOperator!(UT, dchar))
615     {
616         while (true)
617         {
618             if (range.empty) break;
619             current = range.front;
620             
621             static if (until)
622             {
623                 if (current !in charTester)
624                 {
625                     result ~= current;
626                     range.popFront;
627                 }
628                 else break;
629             }
630             else
631             {
632                 if (current in charTester)
633                 {
634                     result ~= current;
635                     range.popFront;
636                 }
637                 else break;
638             }
639         }
640     }
641     else static if (isInputRange!T)
642     {
643         while (true)
644         {
645             if (range.empty) break;
646             current = range.front;
647             
648             static if (until)
649             {
650                 if (!canFind(charTester, current))
651                 {
652                     result ~= current;
653                     range.popFront;
654                 }
655                 else break;
656             }
657             else
658             {
659                 if (canFind(charTester, current))
660                 {
661                     result ~= current;
662                     range.popFront;
663                 }
664                 else break;
665             }
666         }
667     }
668     else static if (isSomeFunction!UT)
669     {
670         while (true)
671         {
672             if (range.empty) break;
673             current = range.front;
674 
675             static if (until)
676             {
677                 if (!charTester(current))
678                 {
679                     result ~= current;
680                     range.popFront;
681                 }
682                 else break;
683             }
684             else
685             {
686                 if (charTester(current))
687                 {
688                     result ~= current;
689                     range.popFront;
690                 }
691                 else break;
692             }
693         }
694     }
695     else static if (isSomeChar!UT)
696     {
697         while (true)
698         {
699             if (range.empty) break;
700             current = range.front;
701 
702             static if (until)
703             {
704                 if (charTester != current)
705                 {
706                     result ~= current;
707                     range.popFront;
708                 }
709                 else break;
710             }
711             else
712             {
713                 if (charTester == current)
714                 {
715                     result ~= current;
716                     range.popFront;
717                 }
718                 else break;
719             }
720         }
721     }
722     else static assert(0, "unsupported charTester argument type in nextWord(): " ~ T.stringof);
723 
724     return result;
725 }
726 ///
727 @safe pure unittest
728 {
729     auto cs1 = "azertyuiopqsdfghjklmwxcvbn";
730     auto cs2 = " \r\n\t";
731     auto cs3 = CharRange('a','z');
732     bool[dchar] cs4 = ['\r':true, '\n': true, '\t':true, ' ':true ];
733     auto src1 = "az er
734     ty";
735     auto src2 = "az er
736     ty";
737 
738     auto w1 = nextWord(src1, cs1);
739     assert(w1 == "az");
740     nextWord(src1, cs2);
741     auto w2 = nextWord(src1, cs1);
742     assert(w2 == "er");
743     nextWord(src1, cs2);
744     auto w3 = nextWord(src1, cs1);
745     assert(w3 == "ty");
746     nextWord(src1, cs2);
747 
748     auto w11 = nextWord(src2, cs3);
749     assert(w11 == "az");
750     nextWord(src2, cs4);
751     auto w22 = nextWord(src2, cs3);
752     assert(w22 == "er");
753     nextWord(src2, cs4);
754     import std.ascii: isAlpha, isDigit;
755     assert(nextWord(src2, &isDigit) == "");
756     auto w33 = nextWord(src2, &isAlpha);
757     assert(w33 == "ty");
758 }
759 
760 
761 /**
762  * Returns the next word in the range passed as argument.
763  *
764  * Params: 
765  *      range = A character input range. The range is consumed for each word.
766  *      charTester = Defines the opposite of the valid characters to make a word.
767  *
768  * Returns:
769  *      A string containing the word. If the result length is null then the
770  *      range parameter has not been consumed.
771  */
772 auto nextWordUntil(Range, T)(ref Range range, T charTester)
773 {
774     return nextWord!(Range, T, true)(range, charTester);
775 }
776 ///
777 @safe pure unittest
778 {
779     auto src = "azertyuiop
780     sdfghjk".dup;
781     auto skp = CharRange("\r\n\t".dup);
782     auto w = nextWordUntil(src, skp);
783     assert(w == "azertyuiop");
784 }
785 
786 
787 /**
788  * Skips the next word in the range passed as argument.
789  *
790  * Params:
791  *      range = A character input range. The range is consumed for each word.
792  *      charTester = Defines the valid characters to make a word.
793  */
794 void skipWord(Range, T, bool until = false)(ref Range range, T charTester)
795 if (isInputRange!Range && isSomeChar!(ElementType!Range) && isCharTester!T)
796 {
797     alias UT = Unqual!T;
798     static if (hasInOperator!(UT, dchar))
799     {
800         while (true)
801         {
802             if (range.empty) break;
803             static if (until)
804             {
805                 if (range.front !in charTester)
806                     range.popFront;
807                 else break;
808             }
809             else
810             {
811                 if (range.front in charTester)
812                     range.popFront;
813                 else break;
814             }       
815         }
816     }
817     else static if (isInputRange!T)
818     {
819         while (true)
820         {
821             if (range.empty) break;
822             static if (until)
823             {
824                 if (!canFind(charTester, range.front))
825                     range.popFront;
826                 else break;            
827             }
828             else
829             {
830                 if (canFind(charTester, range.front))
831                     range.popFront;
832                 else break;
833             }
834         }
835     }
836     else static if (isSomeFunction!UT)
837     {
838         while (true)
839         {
840             if (range.empty) break;
841             static if (until)
842             {
843                 if (!charTester(range.front))
844                     range.popFront;
845                 else break;            
846             }
847             else
848             {                        
849                 if (charTester(range.front))
850                     range.popFront;
851                 else break;
852             }
853         }
854     }
855     else static if (isSomeChar!UT)
856     {
857         while (true)
858         {
859             if (range.empty) break;
860             static if (until)
861             {
862                 if (charTester != range.front)
863                     range.popFront;
864                 else break;
865             }
866             else
867             {
868                 if (charTester == range.front)
869                     range.popFront;
870                 else break;
871             }
872         }
873     }
874     else static assert(0, "unsupported charTester argument type in skipWord(): " ~ T.stringof);
875 }
876 ///
877 @safe pure unittest
878 {
879     auto src1 = "\t\t\r\ndd";
880     auto skp1 = CharRange("\r\n\t");
881     skipWord(src1, skp1);
882     assert(src1 == "dd");
883     import std.ascii: isWhite;
884     auto src2 = "\t\t\r\nee";
885     skipWord(src2, &isWhite);
886     assert(src2 == "ee");
887 }
888 
889 
890 /**
891  * Skips the next word in the range passed as argument.
892  *
893  * Params:
894  *      range = A character input range. The range is consumed for each word.
895  *      charTester = Defines the opposite of the valid characters to make a word.
896  */
897 void skipWordUntil(Range, T)(ref Range range, T charTester)
898 {
899     skipWord!(Range, T, true)(range, charTester);
900 }
901 ///
902 @safe pure unittest
903 {
904     auto src = "dd\r";
905     auto skp = CharRange("\r\n\t");
906     skipWordUntil(src, skp);
907     assert(src == "\r");
908 }
909 
910 
911 /**
912  * Tries to make a fixed length slice by consuming range.
913  *
914  * Params:
915  *      range = A character input range. The range is consumed for each word.
916  *      len = An integral value.
917  *
918  * Returns:
919  *      At the tail a string whose length is less or equal to $(I len), otherwise
920  *      always a string of length $(I len).
921  */
922 auto nextSlice(Range, T)(ref Range range, T len)
923 if (isInputRange!Range && isSomeChar!(ElementType!Range) && isIntegral!T)
924 {
925     CharType!Range[] result;
926     size_t cnt;
927     while (true)
928     {
929         if (cnt == len || range.empty)
930             break;
931         result ~= range.front;
932         range.popFront;
933         ++cnt;
934     }   
935     
936     return result;
937 }
938 ///
939 @safe pure unittest
940 {
941     auto text0 = "012"; 
942     assert(text0.nextSlice(2) == "01");
943     auto text1 = "3";
944     assert(text1.nextSlice(8) == "3");
945     auto text2 = "45";
946     assert(text2.nextSlice(0) == "");
947     assert(text1.nextSlice(12_34_56) == "");
948     auto ut = "é_é";
949     assert(ut.nextSlice(3) == "é_é");
950 }
951 
952 
953 /**
954  * Returns true if a string starts with a particular sub string.
955  *
956  * Params:
957  *      range: A character input range. The range is not consumed.
958  *      stuff: the sub string, also works with single chars.
959  */
960 bool canRead(Range, Stuff)(ref Range range, Stuff stuff)
961 if (isInputRange!Range && isSomeChar!(ElementType!Range)
962     && (isSomeChar!Stuff || isSomeString!Stuff))
963 {
964     static if (isSomeString!Range)
965     {
966         if (range.empty)
967             return false;
968         else
969         {
970             static if (isSomeChar!Stuff)
971                 return range.front == stuff;
972             else
973             {
974                 import std.conv: to;
975                 auto dstuff = to!dstring(stuff);
976                 auto reader = ArrayRange!(ElementEncodingType!Range)(range);
977                 auto slice = reader.nextSlice(dstuff.walkLength);
978                 return dstuff == slice;
979             }
980         }
981     } 
982     else
983     {
984         import std.algorithm.searching: startsWith;
985         return startsWith(range, stuff);
986     } 
987 }
988 ///
989 pure unittest
990 {
991     auto text0 = "{0}".dup;
992     assert(text0.canRead('{'));
993     auto text1 = "(* bla *)".dup;
994     assert(text1.canRead("(*"));
995     assert(text1 == "(* bla *)");
996     string text2 = "0x123456";
997     assert(!text2.canRead("0b"));
998 }
999 //------------------------------------------------------------------------------
1000 // Text scanning utilities ----------------------------------------------------+
1001 
1002 /**
1003  * Returns an input range consisting of the input argument sliced by group of 
1004  * length len.
1005  */
1006 auto bySlice(Range)(auto ref Range range, size_t len)
1007 if (isInputRange!Range && isSomeChar!(ElementType!Range))
1008 {
1009     struct BySlice
1010     {
1011         private bool _emptyLine;
1012         private CharType!Range[] _front;
1013         ///
1014         void popFront()
1015         {
1016             _front = nextSlice(range, len);
1017         }
1018         ///
1019         auto front()
1020         {
1021             return _front;
1022         }
1023         ///
1024         bool empty()
1025         {
1026             return _front.length == 0;
1027         }
1028     }
1029     BySlice bs;
1030     if (!range.empty)
1031         bs.popFront;
1032     return bs;
1033 }
1034 ///
1035 @safe pure unittest
1036 {
1037     auto text = "AABBCCDD";
1038     assert(text.bySlice(2).array == ["AA","BB","CC","DD"]);
1039     auto str = "AAE";
1040     assert(str.bySlice(2).array == ["AA","E"]);
1041 }
1042 
1043 
1044 /**
1045  * Tries to read immediatly an EOL in range and returns it.
1046  */
1047 auto readEol(Range)(ref Range range)
1048 if (isInputRange!Range && isSomeChar!(ElementType!Range))
1049 {
1050     CharType!Range[] result;
1051     if (range.canRead("\r\n")) result = range.nextSlice(2);
1052     else if (range.canRead('\n')) result = range.nextSlice(1);
1053     return result;
1054 }
1055 ///
1056 pure unittest
1057 {
1058     auto text0 = "";
1059     assert(readEol(text0) == "");
1060     auto text1 = " ";
1061     assert(readEol(text1) == "");
1062     auto text2 = "\n";
1063     assert(readEol(text2) == "\n");
1064     auto text3 = "\r\n";
1065     assert(readEol(text3) == "\r\n");
1066 }
1067 
1068 
1069 /**
1070  * Tries to skip immediatly an EOL in range.
1071  */
1072 void skipEol(Range)(ref Range range)
1073 if (isInputRange!Range && isSomeChar!(ElementType!Range))
1074 {
1075     if (range.canRead("\r\n")) range.nextSlice(2);
1076     else if (range.canRead('\n')) range.nextSlice(1);        
1077 }
1078 ///
1079 pure unittest
1080 {
1081     auto text0 = "";
1082     skipEol(text0);
1083     assert(text0 == "");
1084     auto text1 = " ";
1085     skipEol(text1);
1086     assert(text1 == " ");
1087     auto text2 = "\n";
1088     skipEol(text2);
1089     assert(text2 == "");
1090     auto text3 = "\r\na";
1091     skipEol(text3);
1092     assert(text3 == "a");
1093 }
1094 
1095 
1096 /**
1097  * Returns the next line within range.
1098  */
1099 auto nextLine(bool keepTerminator = false, Range)(ref Range range)
1100 {
1101     auto result = nextWordUntil(range, "\r\n");
1102     static if (keepTerminator) result ~= range.readEol;
1103     else range.skipEol;
1104     return result;
1105 }
1106 ///
1107 pure unittest
1108 {
1109     auto text = "123456\r\n12345\n1234\r\n123\r\n12\r\n1";
1110     assert(nextLine!false(text) == "123456");
1111     assert(nextLine!false(text) == "12345");
1112     assert(nextLine!false(text) == "1234");
1113     assert(nextLine!false(text) == "123");
1114     assert(nextLine!false(text) == "12");
1115     assert(nextLine!false(text) == "1");
1116     assert(nextLine!false(text) == "");
1117     assert(nextLine!false(text) == "");
1118 }
1119 
1120 
1121 /**
1122  * Returns an input range consisting of each line in the input argument
1123  */
1124 auto byLine(Range)(auto ref Range range)
1125 if (isInputRange!Range && isSomeChar!(ElementType!Range))
1126 { 
1127     struct ByLine
1128     {
1129         private bool _emptyLine;
1130         private CharType!Range[] _front, _strippedfront;
1131         ///
1132         void popFront()
1133         {
1134             _front = nextLine!true(range);
1135             import std..string: stripRight;
1136             _strippedfront = stripRight(_front);
1137         }
1138         ///
1139         auto front()
1140         {
1141             return _strippedfront;
1142         }
1143         ///
1144         bool empty()
1145         {
1146             return _front.length == 0;
1147         }
1148     }
1149     ByLine bl;
1150     if (!range.empty)
1151         bl.popFront;
1152     return bl;
1153 }
1154 ///
1155 pure unittest
1156 {
1157     auto text = "aw\r\nyess";
1158     auto range = text.byLine;
1159     assert(range.front == "aw");
1160     range.popFront;
1161     assert(range.front == "yess");
1162     auto nums = "0\n1\n2\n3\n4\n5\n6\n7\n8\n9";
1163     import std.algorithm.iteration: reduce;
1164     assert(nums.byLine.reduce!((a,b) => a ~ b) == "0123456789");
1165 }
1166 
1167 
1168 /**
1169  * Returns the lines count within the input range.
1170  * The input range is not consumed.
1171  */
1172 size_t lineCount(Range)(Range range)
1173 {
1174     return range.byLine.array.length;
1175 }
1176 ///
1177 pure unittest
1178 {
1179     auto text1= "";
1180     assert(text1.lineCount == 0);
1181     auto text2 = "\n\r\n";
1182     assert(text2.lineCount == 2);
1183     auto text3 = "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n\n\n";
1184     assert(text3.lineCount == 12);
1185 }
1186 
1187 
1188 /**
1189  * Returns the next word within range. 
1190  * Words are spliited using the White characters, which are never included.
1191  */
1192 auto nextWord(Range)(ref Range range) pure @safe
1193 {
1194     skipWord(range, whiteChars);
1195     return nextWordUntil(range, whiteChars);
1196 }
1197 ///
1198 @safe pure unittest
1199 {
1200     auto text = " lorem ipsum 123456";
1201     assert(text.nextWord == "lorem");
1202     assert(text.nextWord == "ipsum");
1203     assert(text.nextWord == "123456");
1204 }
1205 
1206 
1207 /**
1208  * Returns an input range consisting of each non-blank word in the input argument.
1209  */
1210 auto byWord(Range)(auto ref Range range)
1211 if (isInputRange!Range && isSomeChar!(ElementType!Range))
1212 { 
1213     struct ByWord
1214     {
1215         private CharType!Range[] _front;
1216         ///
1217         void popFront()
1218         {
1219             _front = nextWord(range);
1220         }
1221         ///
1222         auto front()
1223         {
1224             return _front;
1225         }
1226         ///
1227         bool empty()
1228         {
1229             return _front.length == 0;
1230         }
1231     }
1232     ByWord bw;
1233     if (!range.empty)
1234         bw.popFront;
1235     return bw;
1236 }
1237 ///
1238 @safe pure unittest
1239 {
1240     auto text = "aw yess, this is so cool";
1241     auto range = text.byWord;
1242     assert(range.front == "aw");
1243     range.popFront;
1244     assert(range.front == "yess,");
1245     range.popFront;
1246     assert(range.front == "this");
1247     auto nums = "0 1 2 3 4 5 6 7 8 9";
1248     import std.algorithm.iteration: reduce;
1249     assert(nums.byWord.reduce!((a,b) => a ~ b) == "0123456789");
1250 }
1251 
1252 
1253 /**
1254  * Returns the word count within the input range.
1255  * Words are separatedd by ascii whites. input range is not consumed.
1256  */
1257 size_t wordCount(Range)(Range range)
1258 {
1259     return range.byWord.array.length;
1260 }
1261 ///
1262 @safe pure unittest
1263 {
1264     auto text = "1 2 3 4 5 6 7 8 9 \n 10";
1265     assert(text.wordCount == 10);
1266     assert(text == "1 2 3 4 5 6 7 8 9 \n 10");
1267 }
1268 
1269 
1270 /**
1271  * Returns the next separated word.
1272  * Separators are always removed, white characters optionally.
1273  */
1274 auto nextSeparated(Range, Separators, bool strip = true)(auto ref Range range, Separators sep)
1275 {
1276     auto result = nextWordUntil(range, sep);
1277     if (!range.empty) range.popFront;
1278     static if (strip)
1279     {
1280         skipWord(result, whiteChars);
1281         result = nextWordUntil(result, whiteChars);
1282     }
1283     return result;
1284 }
1285 ///
1286 @safe pure unittest
1287 {
1288     auto seps = CharMap[',', '\n'];
1289     auto text = "name, âge \n Douglas, 27 \n Sophia 26";
1290     assert(text.nextSeparated(seps) == "name");
1291     assert(text.nextSeparated(seps) == "âge");
1292     assert(text.nextSeparated(seps) == "Douglas");
1293     assert(text.nextSeparated(seps) == "27");
1294 }
1295 
1296 
1297 /**
1298  * Returns an input range consisting of each separated word in the input argument
1299  */
1300 auto bySeparated(Range, Separators, bool strip = true)(auto ref Range range, Separators sep)
1301 if (isInputRange!Range && isSomeChar!(ElementType!Range))
1302 {
1303     struct BySep
1304     {
1305         private CharType!Range[] _front;
1306         ///
1307         void popFront()
1308         {
1309             _front = nextSeparated!(Range, Separators, strip)(range, sep);
1310         }
1311         ///
1312         auto front()
1313         {
1314             return _front;
1315         }
1316         ///
1317         bool empty()
1318         {
1319             return _front.length == 0;
1320         }
1321     }
1322     BySep bs;
1323     if (!range.empty)
1324         bs.popFront;
1325     return bs;
1326 }
1327 ///
1328 @safe pure unittest
1329 {
1330     auto text = "name = Douglas \n age =27 \n";
1331     auto range = text.bySeparated(CharMap['=', '\n']);
1332     assert(range.front == "name");
1333     range.popFront;
1334     assert(range.front == "Douglas");
1335     range.popFront;
1336     assert(range.front == "age");
1337     range.popFront;
1338     assert(range.front == "27");
1339     range.popFront;
1340     assert(range.empty);
1341 }
1342 
1343 /**
1344  * Immediatly reads a decimal number.
1345  */
1346 auto readDecNumber(Range)(auto ref Range range)
1347 {
1348     return range.nextWord(decimalChars);
1349 }
1350 ///
1351 @safe pure unittest
1352 {
1353     auto text = "0123456 789";
1354     assert(text.readDecNumber == "0123456");
1355     text.popFront;
1356     assert(text.readDecNumber == "789");
1357     
1358     string t = "456";
1359     if (auto num = readDecNumber(t))
1360         assert (num == "456");
1361 }
1362 
1363 
1364 /**
1365  * Immediatly reads an hexadecimal number.
1366  */
1367 auto readHexNumber(Range)(auto ref Range range)
1368 {
1369     return range.nextWord(hexChars);
1370 }
1371 ///
1372 @safe pure unittest
1373 {
1374     auto text1 = "1a2B3C o";
1375     assert(text1.readHexNumber == "1a2B3C");
1376     assert(text1 == " o");
1377     auto text2 = "A897F2f2Ff2fF3c6C9c9Cc9cC9c123 o";
1378     assert(text2.readHexNumber == "A897F2f2Ff2fF3c6C9c9Cc9cC9c123");
1379     assert(text2 == " o");
1380 }
1381 
1382 
1383 /**
1384  * Strips leading white characters.
1385  */
1386 void stripLeftWhites(Range)(auto ref Range range)
1387 {
1388     range.skipWord(whiteChars);
1389 }
1390 ///
1391 pure unittest
1392 {
1393     auto text = "  \n\r\v bla".dup;
1394     auto rng = ArrayRange!char(text);
1395     rng.stripLeftWhites;
1396     assert(rng.array == "bla");
1397 }
1398 
1399 
1400 /**
1401  * Escapes characters in the input text.
1402  *
1403  * Params:
1404  *      range = The character range to process. The source is not consumed.
1405  *      pairs = An array of pair. Each pair (char[2]) defines a source and a
1406  *      target character. The slash is automatically escaped and must not be
1407  *      included in the array.
1408  * Returns:
1409  *      An array of character whose type matches the range element type.
1410  */
1411 auto escape(Range)(Range range, const char[2][] pairs)
1412 if (isInputRange!Range && isSomeChar!(ElementType!Range))
1413 in
1414 {
1415     foreach(pair; pairs)
1416     {
1417         assert(pair[0] != '\\', "the backslash should not be set as pair");
1418         assert(pair[1] != '\\', "the backslash should not be set as pair");
1419     }
1420 }
1421 body
1422 {
1423     CharType!Range[] result;
1424     dchar front;
1425     bool done = void, wasSlash = void;
1426     while (!range.empty)
1427     {
1428         wasSlash = front == '\\';
1429         front = range.front;
1430         done = false;
1431         foreach(pair; pairs) if (front == pair[0] && !wasSlash)
1432         {
1433             done = true;
1434             result ~= `\` ~ pair[1];
1435             range.popFront;
1436             break;
1437         }
1438         if (front == '\\')
1439             result ~= front;
1440         if (!done)
1441         {
1442             result ~= front;
1443             range.popFront;
1444         }
1445     }
1446     return result;
1447 }
1448 ///
1449 @safe pure unittest
1450 {
1451     assert(`1"`.escape([['"','"']]) == `1\"`);
1452     assert(`1"1"11"1`.escape([['"','"']]) == `1\"1\"11\"1`);
1453     assert("\n\"1".escape([['"','"'],['\n','n']]) == `\n\"1`);
1454     assert(`1\"`.escape([['"','"']]) == `1\\"`);
1455     assert(`\`.escape([]) == `\\`);
1456 }
1457 
1458 
1459 /**
1460  * Un-escapes characters in the input text.
1461  *
1462  * Params:
1463  *      range = The character range to process. The source is not consumed.
1464  *      pairs = An array of pair. Each pair (char[2]) defines a target and a
1465  *      source character. The slash is automatically unescaped and must not be
1466  *      included in the array.
1467  * Returns:
1468  *      An array of character whose type matches the range element type.
1469  *      Even if invalid, any unterminated sequence located at the end of the
1470  *      range is appended to the result.
1471  */
1472 auto unEscape(Range)(Range range, const char[2][] pairs)
1473 if (isInputRange!Range && isSomeChar!(ElementType!Range))
1474 in
1475 {
1476     foreach(pair; pairs)
1477     {
1478         assert(pair[0] != '\\', "the backslash should not be set as pair");
1479         assert(pair[1] != '\\', "the backslash should not be set as pair");
1480     }
1481 }
1482 body
1483 {
1484     CharType!Range[] result;
1485     dchar front = void;
1486     bool slash;
1487     while(!range.empty)
1488     {
1489         front = range.front;
1490         if (slash && front == '\\')
1491         {
1492             result ~= '\\';
1493             slash = false;
1494             range.popFront;
1495             continue;
1496         }
1497         if (front == '\\')
1498         {
1499             slash = true;
1500             range.popFront;
1501             if (range.empty)
1502                 result ~= '\\';
1503             continue;
1504         }
1505         if (slash)
1506         {
1507             foreach(pair; pairs) if (front == pair[1])
1508             {
1509                 result ~= pair[0];
1510                 slash = false;
1511                 break;
1512             }
1513             if (slash) result ~= '\\';
1514             slash = false;
1515         }
1516         else result ~= front;
1517         range.popFront;
1518     }
1519     return result;
1520 }
1521 ///
1522 @safe pure unittest
1523 {
1524     assert( `1\"`.unEscape([['"','"']]) == `1"`);
1525     assert(`1\"1\"11\"1`.unEscape([['"','"']]) == `1"1"11"1`);
1526     assert(`\n\"1`.unEscape([['"','"'],['\n','n']]) == "\n\"1");
1527     assert(`\\\\`.unEscape([]) == `\\`);
1528     assert(`\\`.unEscape([]) == `\`);
1529     assert(`\`.unEscape([]) == `\`);
1530 }
1531 
1532 //------------------------------------------------------------------------------
1533 // Misc string-related stuff --------------------------------------------------+
1534 
1535 /**
1536  * The suffix array is a data structure that can be used to test quickly
1537  * the presence of a value in a list. It's also adapted to get completion
1538  * proposals for a prefix.
1539  *
1540  * The performance gain over a canFind() is excellent (98%). The gain over the
1541  * built-in associative array is slight (3%) and the gain over EMSI hash map is
1542  * good (40%). Despite of its speed, it always wastes a lot of memory because
1543  * each byte in an entry is represented by an array of 256 pointers.
1544  * For example count 800 MB for /usr/share/dict/words (300K words) on X86_64.
1545  *
1546  * This implementation only works with arrays of character made of single byte
1547  * units (char[], string, const(char)[], etc) but is compatible with multi byte
1548  * characters.
1549  */
1550 struct SuffixArray(T)
1551 if ((ElementEncodingType!T).sizeof == 1)
1552 {
1553 
1554 private:
1555 
1556     import iz.memory: construct, destruct;
1557     import std.experimental.allocator: make, dispose;
1558 
1559     static if (isSomeString!T)
1560         alias TT = ElementEncodingType!T;
1561     else
1562         alias TT = typeof(T.init[0]);
1563 
1564     alias Nodes = Node*[256];
1565 
1566     Node _root;
1567 
1568 public:
1569 
1570     /// A node in the array.
1571     struct Node
1572     {
1573 
1574     private:
1575 
1576         bool _terminates;
1577         immutable ubyte _index;
1578         Nodes _nodes;
1579 
1580         /// To be private (emplace)
1581         public this(const ubyte index) pure nothrow @safe @nogc
1582         {
1583             _index = index;
1584         }
1585 
1586     public:
1587 
1588         /**
1589          * Adds a suffix to the prefix represented by this node.
1590          */
1591         void addSuffix(const T suffix) nothrow @safe @nogc
1592         {
1593             //TODO-csuffixarray: value returned by find() doesnt allow addSuffix without casting const away.
1594             if (!suffix.length)
1595                 _terminates = true;
1596             else
1597             {
1598                 ubyte newIndex = suffix[0];
1599                 if (!_nodes[newIndex])
1600                     _nodes[newIndex] = construct!Node(newIndex);
1601                 _nodes[newIndex].addSuffix(suffix[1..$]);
1602             }
1603         }
1604 
1605         /**
1606          * Returns true if this node represents a full entry.
1607          */
1608         bool terminates() const pure nothrow @safe @nogc
1609         {
1610             return _terminates;
1611         }
1612 
1613         /**
1614          * Finds a full suffix from this node.
1615          */
1616         const(Node)* opBinaryRight(string op : "in")(const T suffix) const pure nothrow @safe @nogc
1617         {
1618             if (suffix.length == 0)
1619             {
1620                 if (_terminates)
1621                     return &this;
1622                 else
1623                     return null;
1624 
1625             }
1626             else if (_nodes[suffix[0]] == null)
1627                 return null;
1628             else
1629                 return suffix[1..$] in *(_nodes[suffix[0]]);
1630         }
1631 
1632         /// Aliases to the $(D in) operator.
1633         alias find = opBinaryRight!"in";
1634 
1635         /**
1636          * Finds a prefix from this node.
1637          */
1638         const(Node)* findPrefix(const T value) const pure nothrow @safe @nogc
1639         {
1640             if (!value.length)
1641                 return null;
1642 
1643             const(Node)* result = &this;
1644             foreach (i; 0 .. value.length)
1645             {
1646                 result = result._nodes[value[i]];
1647                 if (!result)
1648                     break;
1649             }
1650             return result;
1651         }
1652 
1653         //TODO-csuffixarray: conditionalVisit() that allows to break
1654 
1655         /// see SuffixArray.visitAll.
1656         void visit(alias fun, bool descending = false, bool childrenFirst = true, A...)
1657             (ref ubyte[] path, auto ref A a) const nothrow @safe
1658         if (isValidVisitor!fun)
1659         {
1660             path ~= _index;
1661             scope (exit) path.length -= 1;
1662 
1663             static if (!childrenFirst)
1664                 fun(&this, path, a);
1665 
1666             static if (descending)
1667             {
1668                 foreach_reverse (ubyte i; 0..256)
1669                     if (_nodes[i])
1670                         _nodes[i].visit!(fun, descending, childrenFirst)(path, a);
1671             }
1672             else
1673             {
1674                 foreach (ubyte i; 0..256)
1675                     if (_nodes[i])
1676                         _nodes[i].visit!(fun, descending, childrenFirst)(path, a);
1677             }
1678 
1679             static if (childrenFirst)
1680                 fun(&this, path, a);
1681         }
1682 
1683         /**
1684          * Returns the terminated entries that begin from this node.
1685          * In the results, an empty value means that this node terminates a word.
1686          */
1687         T[] entries() const nothrow @safe
1688         {
1689             nothrow @trusted
1690             static void fun(const(Node)* node, ref const ubyte[] path, ref T[] results)
1691             {
1692                 if (node._terminates)
1693                     results ~= (cast(T) path)[1..$];
1694             }
1695 
1696             T[] results;
1697             ubyte[] path;
1698             visit!(fun, false, true)(path, results);
1699             return results;
1700         }
1701     }
1702 
1703     @disable this();
1704     @disable this(this);
1705 
1706     /**
1707      * Constructs the array from a range of elements.
1708      */
1709     this(E)(E entries)
1710     if (isInputRange!E && isImplicitlyConvertible!(ElementType!E,T))
1711     {
1712         clear;
1713         static if (isArray!E)
1714         {
1715             foreach (ref entry; entries)
1716             {
1717                 if (!entry.length)
1718                     continue;
1719                 _root.addSuffix(entry);
1720             }
1721         }
1722         else foreach (entry; entries)
1723         {
1724             if (!entry.length)
1725                 continue;
1726             _root.addSuffix(entry);
1727         }
1728     }
1729 
1730     ~this()
1731     {
1732         clear;
1733     }
1734 
1735     /**
1736      * Empties the array.
1737      */
1738     void clear() @safe @nogc
1739     {
1740         void clearNode(ref Node* node) @trusted @nogc
1741         {
1742             if (!node)
1743                 return;
1744             foreach (ubyte i; 0..256)
1745                 clearNode(node._nodes[i]);
1746             destruct(node);
1747             node = null;
1748         }
1749         foreach (ubyte i; 0..256)
1750             clearNode(_root._nodes[i]);
1751     }
1752 
1753     /**
1754      * Determines wether a full value is in the array.
1755      *
1756      * Params:
1757      *      value = The value to search for.
1758      * Returns:
1759      *      Null if value is not in the array otherwise a pointer to
1760      *      the node that terminates the path to value.
1761      */
1762     const(Node)* opBinaryRight(string op : "in")(const T value) const pure nothrow @safe @nogc
1763     {
1764         if (!value.length)
1765             return null;
1766         else
1767             return value in _root;
1768     }
1769 
1770     /// ditto
1771     alias find = opBinaryRight!"in";
1772 
1773     /**
1774      * Determines wether an entry starts with value.
1775      *
1776      * Params:
1777      *      value = The prefix to search for.
1778      * Returns:
1779      *      Null if value is not in the array otherwise a pointer to
1780      *      the node that gives the entries starting with value.
1781      */
1782     const(Node)* findPrefix(const T value) const pure nothrow @safe @nogc
1783     {
1784         if (!value.length)
1785             return null;
1786         else
1787             return _root.findPrefix(value);
1788     }
1789 
1790     /**
1791      * Prototype for the function passed in visitAll and Node.visit.
1792      *
1793      * Params:
1794      *      node = The node that's visited.
1795      *      path = The path that leads to the node. It also represents the value.
1796      *      a = The variadic parameters, i.e the callback "user parameters".
1797      */
1798     alias Fun(A...) = void function(const(Node)* node, ref const ubyte[] path, A a);
1799 
1800     /// Indicates wether a function is suitable for visitAll() or Node.visit()
1801     template isValidVisitor(alias fun)
1802     {
1803         static if (!is(fun))
1804             alias F = typeof(fun);
1805         else
1806             alias F = fun;
1807 
1808         enum isValidVisitor =
1809             (isCallable!F) &&
1810             (Parameters!F).length >= 2 &&
1811             is(Parameters!F[0] == Parameters!(Fun!())[0]) &&
1812             is(Parameters!F[1] == Parameters!(Fun!())[1]) &&
1813             ParameterStorageClassTuple!F[1] == ParameterStorageClass.ref_;
1814     }
1815 
1816     /**
1817      * Visits all the nodes with a function.
1818      *
1819      * Params:
1820      *      fun = See the Fun prototype.
1821      *      descending = Indicates if the visit starts from the end.
1822      *      childrenFirst = Indicates if the children are visited before their parent.
1823      *      a = The variadic parameters passed to fun.
1824      */
1825     void visitAll(alias fun, bool descending = false,
1826         bool childrenFirst = true, A...)(auto ref A a) const nothrow @safe
1827     if (isValidVisitor!fun)
1828     {
1829         ubyte[] path;
1830         _root.visit!(fun, descending, childrenFirst)(path, a);
1831     }
1832 
1833     /**
1834      * Sorts the entries.
1835      *
1836      * While sorting can be easily done with suffix trees this is much slower
1837      * than a classic quick sort. Also note that sorting is not unicode aware
1838      * which means that unless the entries are made of ASCII chars, the results
1839      * will be different from std.algorithm sort().
1840      *
1841      * Params:
1842      *      descending = Defines the sorting direction.
1843      * Returns:
1844      *      An array of entries.
1845      */
1846     T[] sort(bool descending = false) nothrow @safe
1847     {
1848         nothrow @trusted
1849         static void fun(const(Node)* node, ref const ubyte[] path, ref T[] results)
1850         {
1851             if (node.terminates)
1852                 results ~= (cast(T) path)[1..$];
1853         }
1854 
1855         T[] results;
1856         if (descending)
1857             visitAll!(fun, true, true)(results);
1858         else
1859             visitAll!(fun, false, true)(results);
1860         return results;
1861     }
1862 
1863     /**
1864      * Indicates the amount of memory used by the array.
1865      */
1866     size_t memoryUsage() const nothrow @safe
1867     {
1868         nothrow @safe
1869         static void fun(const(Node)* node, const ref ubyte[] path, ref size_t result)
1870         {
1871             result += Node.sizeof;
1872         }
1873 
1874         size_t result;
1875         visitAll!fun(result);
1876         return result;
1877     }
1878 }
1879 ///
1880 unittest
1881 {
1882     string[] source = ["Cairo", "Calcutta", "Calgary", "Cali", "Campinas",
1883         "Cape Town", "Caracas", "Casablanca", "Changchun", "Changde"];
1884 
1885     auto cities = SuffixArray!string(source);
1886 
1887     // test for presence
1888     assert("Cairo" in cities);
1889     assert("Calcutta" in cities);
1890     assert("Calgary" in cities);
1891     assert("Chicago" !in cities);
1892     assert("" !in cities);
1893 
1894     // get completion list
1895     auto pre = cities.findPrefix("Cal");
1896     assert(pre);
1897     assert(!pre.terminates);
1898     assert(pre.entries == ["Calcutta"[3..$], "Calgary"[3..$], "Cali"[3..$]]);
1899 
1900     // sorting
1901     auto desc = cities.sort(true);
1902     import std.algorithm.sorting;
1903     assert(desc == sort!"a > b"(source).array);
1904     auto asc = cities.sort(false);
1905     assert(asc == sort!"a < b"(source).array);
1906 
1907     // adds from a prefix
1908     auto ch = cast(cities.Node*) cities.findPrefix("Ch");
1909     assert(ch);
1910     ch.addSuffix("icago");
1911     assert("Chicago" in cities);
1912 
1913     // memory usage
1914     size_t count;
1915     foreach(s; source) foreach(i; 0..s.length)
1916         ++count;
1917     // actual usage is more close to count * 256 * size_t.sizeof
1918     assert(cities.memoryUsage >= count);
1919 
1920     // clearing is global
1921     cities.clear;
1922     assert("Cairo" !in cities);
1923     assert("Calcutta" !in cities);
1924 }
1925 
1926 unittest
1927 {
1928     ubyte[][] source =
1929     [
1930         [0x11,0x22,0x33],
1931         [0x11,0x33],
1932         [0x1,0x2,0x3]
1933     ];
1934     SuffixArray!(ubyte[]) sar = SuffixArray!(ubyte[])(source);
1935 
1936     assert(sar.find(source[0]));
1937     assert(sar.find(source[1]));
1938     assert(sar.find(source[2]));
1939 
1940     ubyte[] notin = [0x11,0x22,0x34];
1941     assert(!sar.find(notin));
1942     assert(sar.findPrefix([0x76]) is null);
1943     assert(sar.findPrefix([]) is null);
1944     assert(sar.findPrefix([0x11]).findPrefix([0x22]).findPrefix([]) is null);
1945     assert(sar.findPrefix([0x11]).find([0x22,0x99]) is null);
1946     assert(sar.findPrefix([0x11]).find([]) is null);
1947 }
1948 
1949 //TODO-csuffixarray: choose the right version
1950 struct SuffixArrayTemp(T)
1951 if ((ElementEncodingType!T).sizeof == 1)
1952 {
1953 
1954 private:
1955 
1956     import iz.memory: construct, destruct, getMem, freeMem;
1957 
1958     static if (isSomeString!T)
1959         alias TT = ElementEncodingType!T;
1960     else
1961         alias TT = typeof(T.init[0]);
1962 
1963     alias Nodes = Node*[256];
1964 
1965     Node _root;
1966 
1967 
1968 public:
1969 
1970     struct Node
1971     {
1972 
1973     private:
1974 
1975         bool _terminates;
1976         immutable ubyte _index;
1977         Nodes* _nodes;
1978 
1979         public this(const ubyte index) pure nothrow @safe @nogc
1980         {
1981             _index = index;
1982         }
1983 
1984     public:
1985 
1986         ~this() @nogc
1987         {
1988             if (_nodes)
1989                 freeMem(cast(void*)_nodes);
1990             _nodes = null;
1991         }
1992 
1993 
1994         void addSuffix(const T suffix) nothrow @trusted @nogc
1995         {
1996             if (!suffix.length)
1997                 _terminates = true;
1998             else
1999             {
2000                 ubyte newIndex = suffix[0];
2001                 if (!_nodes)
2002                 {
2003                     _nodes = cast(Nodes*) getMem(256 * size_t.sizeof);
2004                     (*_nodes)[] = null;
2005                 }
2006                 if (!(*_nodes)[newIndex])
2007                     (*_nodes)[newIndex] = construct!Node(newIndex);
2008                 (*_nodes)[newIndex].addSuffix(suffix[1..$]);
2009             }
2010         }
2011 
2012 
2013         bool terminates() const pure nothrow @safe @nogc
2014         {
2015             return _terminates;
2016         }
2017 
2018 
2019         const(Node)* opBinaryRight(string op = "in")(const T suffix) const pure nothrow @trusted @nogc
2020         if (op == "in")
2021         {
2022             if (suffix.length == 0)
2023             {
2024                 if (_terminates)
2025                     return &this;
2026                 else
2027                     return null;
2028 
2029             }
2030             else if (!_nodes)
2031                 return null;
2032             else if ((*_nodes)[suffix[0]] == null)
2033                 return null;
2034             else
2035                 return (*_nodes)[suffix[0]].find(suffix[1..$]);
2036         }
2037 
2038         alias find = opBinaryRight!"in";
2039 
2040         const(Node)* findPrefix(const T value) const pure nothrow @trusted @nogc
2041         {
2042             if (!value.length)
2043                 return null;
2044 
2045             const(Node)* result = &this;
2046             if (result._nodes) foreach (i; 0 .. value.length)
2047             {
2048                 result = (*result._nodes)[value[i]];
2049                 if (!result)
2050                     break;
2051             }
2052             return result;
2053         }
2054 
2055         void visit(alias fun, bool descending = false, bool childrenFirst = true, A...)
2056             (ref ubyte[] path, auto ref A a) const nothrow @trusted
2057         if (isValidVisitor!fun)
2058         {
2059             path ~= _index;
2060             scope (exit) path.length -= 1;
2061 
2062             static if (!childrenFirst)
2063                 fun(&this, path, a);
2064 
2065             static if (descending)
2066             {
2067                 if (_nodes) foreach_reverse (ubyte i; 0..256)
2068                     if ((*_nodes)[i])
2069                         (*_nodes)[i].visit!(fun, descending, childrenFirst)(path, a);
2070             }
2071             else
2072             {
2073                 if (_nodes) foreach (ubyte i; 0..256)
2074                     if ((*_nodes)[i])
2075                         (*_nodes)[i].visit!(fun, descending, childrenFirst)(path, a);
2076             }
2077 
2078             static if (childrenFirst)
2079                 fun(&this, path, a);
2080         }
2081 
2082         T[] entries() const nothrow @safe
2083         {
2084             nothrow @trusted
2085             static void fun(const(Node)* node, ref const ubyte[] path, ref T[] results)
2086             {
2087                 if (node._terminates)
2088                     results ~= (cast(T) path)[1..$];
2089             }
2090 
2091             T[] results;
2092             ubyte[] path;
2093             visit!(fun, false, true)(path, results);
2094             return results;
2095         }
2096     }
2097 
2098     this(E)(E entries) nothrow @safe @nogc
2099     if (isInputRange!E && isImplicitlyConvertible!(ElementType!E,T))
2100     {
2101         clear;
2102         foreach (entry; entries)
2103         {
2104             if (!entry.length)
2105                 continue;
2106             _root.addSuffix(entry);
2107         }
2108     }
2109 
2110     ~this()
2111     {
2112         clear;
2113     }
2114 
2115     void clear() @safe @nogc
2116     {
2117         void clearNode(ref Node* node) @trusted @nogc
2118         {
2119             if (!node)
2120                 return;
2121             if (node._nodes) foreach (ubyte i; 0..256)
2122                 clearNode((*node._nodes)[i]);
2123             destruct(node);
2124             node = null;
2125         }
2126         if (_root._nodes)
2127         {
2128             foreach (ubyte i; 0..256)
2129                 clearNode((*_root._nodes)[i]);
2130             freeMem(cast(void*) _root._nodes);
2131             _root._nodes = null;
2132         }
2133     }
2134 
2135     const(Node)* opBinaryRight(string op = "in")(const T value) const pure nothrow @safe @nogc
2136     if (op == "in")
2137     {
2138         if (!value.length)
2139             return null;
2140         else
2141             return _root.find(value);
2142     }
2143 
2144     alias find = opBinaryRight!"in";
2145 
2146     const(Node)* findPrefix(const T value) const pure nothrow @safe @nogc
2147     {
2148         if (!value.length)
2149             return null;
2150         else
2151             return _root.findPrefix(value);
2152     }
2153 
2154     alias Fun(A...) = void function(const(Node)* node, ref const ubyte[] path, A a);
2155 
2156     template isValidVisitor(alias fun)
2157     {
2158         static if (!is(fun))
2159             alias F = typeof(fun);
2160         else
2161             alias F = fun;
2162 
2163         enum isValidVisitor =
2164             (isCallable!F) &&
2165             (Parameters!F).length >= 2 &&
2166             is(Parameters!F[0] == const(Node)*) &&
2167             is(Parameters!F[1] == const ubyte[]) &&
2168             ParameterStorageClassTuple!F[1] == ParameterStorageClass.ref_;
2169     }
2170 
2171 
2172     void visitAll(alias fun, bool descending = false,
2173         bool childrenFirst = true, A...)(auto ref A a) const nothrow @safe
2174     if (isValidVisitor!fun)
2175     {
2176         ubyte[] path;
2177         _root.visit!(fun, descending, childrenFirst)(path, a);
2178     }
2179 
2180 
2181     T[] sort(bool descending = false) nothrow @safe
2182     {
2183         nothrow @trusted
2184         static void fun(const(Node)* node, ref const ubyte[] path, ref T[] results)
2185         {
2186             if (node.terminates)
2187                 results ~= (cast(T) path)[1..$];
2188         }
2189 
2190         T[] results;
2191         if (descending)
2192             visitAll!(fun, true, true)(results);
2193         else
2194             visitAll!(fun, false, true)(results);
2195         return results;
2196     }
2197 
2198     size_t memoryUsage() const nothrow @safe
2199     {
2200         nothrow @safe
2201         static void fun(const(Node)* node, const ref ubyte[] path, ref size_t result)
2202         {
2203             result += Node.sizeof;
2204         }
2205 
2206         size_t result;
2207         visitAll!fun(result);
2208         return result;
2209     }
2210 }
2211 
2212 unittest
2213 {
2214     string[] source = ["Cairo", "Calcutta", "Calgary", "Cali", "Campinas",
2215         "Cape Town", "Caracas", "Casablanca", "Changchun", "Changde"];
2216 
2217     auto cities = SuffixArrayTemp!string(source);
2218 
2219     // test for presence
2220     assert("Cairo" in cities);
2221     assert("Calcutta" in cities);
2222     assert("Calgary" in cities);
2223     assert("Chicago" !in cities);
2224     assert("" !in cities);
2225 
2226     // get completion list
2227     auto pre = cities.findPrefix("Cal");
2228     assert(pre);
2229     assert(!pre.terminates);
2230     assert(pre.entries == ["Calcutta"[3..$], "Calgary"[3..$], "Cali"[3..$]]);
2231 
2232     // sorting
2233     auto desc = cities.sort(true);
2234     import std.algorithm.sorting;
2235     assert(desc == sort!"a > b"(source).array);
2236     auto asc = cities.sort(false);
2237     assert(asc == sort!"a < b"(source).array);
2238 
2239     // adds from a prefix
2240     auto ch = cast(cities.Node*) cities.findPrefix("Ch");
2241     assert(ch);
2242     ch.addSuffix("icago");
2243     assert("Chicago" in cities);
2244 
2245     // memory usage
2246     size_t count;
2247     foreach(s; source) foreach(i; 0..s.length)
2248         ++count;
2249     // actual usage is actually more close to count * 256 * size_t.sizeof
2250     assert(cities.memoryUsage >= count);
2251 
2252     // clearing is global
2253     cities.clear;
2254     assert("Cairo" !in cities);
2255     assert("Calcutta" !in cities);
2256 }
2257 
2258 
2259 //------------------------------------------------------------------------------
2260