1 module yu.tools.http1xparser.url;
2 
3 import yu.tools.http1xparser.default_;
4 import yu.traits;
5 
6 @trusted :
7 
/**
 * Identifies one component of a parsed URL.
 *
 * The numeric value of each member is used both as an index into
 * ParserdUrl.fieldData and as the bit position (1 << UF_*) inside
 * ParserdUrl.fieldSet. UF_MAX is the number of real components and sizes
 * the fieldData array; it does not name a component itself.
 */
enum URLFieldsType : ushort
{
    UF_SCHEMA   = 0,
    UF_HOST     = 1,
    UF_PORT     = 2,
    UF_PATH     = 3,
    UF_QUERY    = 4,
    UF_FRAGMENT = 5,
    UF_USERINFO = 6,
    UF_MAX      = 7
}
19 
/* Result structure for httpParserURL().
 *
 * Callers should index into fieldData[] with UF_* values iff fieldSet
 * has the relevant (1 << UF_*) bit set. As a courtesy to clients the
 * port, when present, is additionally converted to a ushort.
 */
struct ParserdUrl 
{
  ushort fieldSet;           /* Bitmask of (1 << UF_*) values */
  ushort port;                /* Converted UF_PORT string */

  struct Field {
    ushort off;               /* Offset into buffer in which field starts */
    ushort len;               /* Length of run in buffer */
  } 
  Field[URLFieldsType.UF_MAX] fieldData;

  /* Returns true when the bit for `type` is set in fieldSet. */
  pragma(inline,true)
  bool hasField(URLFieldsType type) nothrow @nogc
  {
       return (fieldSet & ( 1 << type)) != 0;
  }

  /* Returns the slice of `url` that holds the component `type`.
   *
   * `url` must be the buffer that was handed to httpParserURL(); the stored
   * offsets are relative to it. An empty (init) slice is returned when the
   * component is absent, when `type` is not a real component (UF_MAX or
   * above), or when the recorded range does not fit inside `url`.
   */
  auto getField(CHAR)(CHAR[] url, URLFieldsType type) @nogc nothrow
  {
      /* Validate before touching fieldData: UF_MAX is a legal enum value
       * but one past the end of the array. */
      if (type >= URLFieldsType.UF_MAX || !hasField(type))
        return (CHAR[]).init;

      const size_t start = fieldData[type].off;
      const size_t stop  = start + fieldData[type].len;
      if (stop > url.length)
        return (CHAR[]).init;
      return url[start .. stop];
  }
}
52 
/**
 * Parses `url` and records where each of its components lives.
 *
 * With `strict == true` the input is handled like a CONNECT request target:
 * parsing starts directly at the server part and the call succeeds only when
 * exactly a host and a port were found (any schema, path, query, fragment or
 * userinfo makes it fail).
 *
 * Params:
 *   url = text to parse; all offsets stored in `u` are relative to it
 *   u   = receives the result (being an `out` parameter it is reset on entry)
 *
 * Returns: true on success. false when the text is not a valid URL, when a
 * schema is present without a host, when the port exceeds 65535, or when the
 * input is longer than ushort.max bytes (offsets and lengths are stored as
 * ushort and cannot describe such a buffer).
 */
bool httpParserURL(bool strict = false, CHAR)(CHAR[] url , out  ParserdUrl u) @nogc nothrow 
                                                                            if(isCharByte!CHAR)
{
  const ubyte[] data = cast(const ubyte[])(url);

  /* Offsets/lengths are ushort; refuse input they cannot address instead of
   * silently truncating and returning fields that point at the wrong bytes. */
  if (data.length > ushort.max) {
    return false;
  }

  HTTPParserState s = strict ? HTTPParserState.s_req_server_start : HTTPParserState.s_req_spaces_before_url;
  URLFieldsType uf;
  URLFieldsType old_uf = URLFieldsType.UF_MAX;   /* "no field yet" marker */
  bool found_at = false;

  for (size_t p = 0; p < data.length; p++) with(URLFieldsType){
    const ubyte ch = data[p];
    s = parseURLchar(s, ch);

    /* Figure out the next field that we're operating on */
    switch (s) with(HTTPParserState){
      case s_dead:
        return false;

      /* Skip delimiters */
      case s_req_schema_slash:
      case s_req_schema_slash_slash:
      case s_req_server_start:
      case s_req_query_string_start:
      case s_req_fragment_start:
        continue;

      case s_req_schema:
        uf = UF_SCHEMA;
        break;

      case s_req_server_with_at:
        /* Remember the '@' so the host pass starts in userinfo mode */
        found_at = true;
        goto case;
      /* FALLTHROUGH */
      case s_req_server:
        uf = UF_HOST;
        break;

      case s_req_path:
        uf = UF_PATH;
        break;

      case s_req_query_string:
        uf = UF_QUERY;
        break;

      case s_req_fragment:
        uf = UF_FRAGMENT;
        break;

      default:
        assert(!"Unexpected state");
        return false;
    }

    /* Nothing's changed; soldier on */
    if (uf == old_uf) {
      u.fieldData[uf].len++;
      continue;
    }

    /* First byte of a new field */
    u.fieldData[uf].off = cast(ushort)p;
    u.fieldData[uf].len = 1;

    u.fieldSet |= (1 << uf);
    old_uf = uf;
  }

  /* host must be present if there is a schema */
  /* parsing http:///toto will fail */
  if (u.hasField(URLFieldsType.UF_SCHEMA) && (!u.hasField(URLFieldsType.UF_HOST))) {
    return false;
  }
  /* Split the raw server part into userinfo / host / port */
  if (u.hasField(URLFieldsType.UF_HOST)) {
    if(!parserHost(data, u, found_at)) {
      return false;
    }
  }
  /* CONNECT requests can only contain "hostname:port" */
  if (strict && u.fieldSet != ((1 << URLFieldsType.UF_HOST)|(1 << URLFieldsType.UF_PORT))) {
    return false;
  }

  if (u.hasField(URLFieldsType.UF_PORT)) {
    /* The port bytes were already validated as [0-9] by parserHost(), so a
     * bounded digit loop is enough. A D slice is not NUL-terminated, which
     * is why a C string-to-number routine must not be pointed at it. */
    const size_t portStart = u.fieldData[URLFieldsType.UF_PORT].off;
    const size_t portEnd   = portStart + u.fieldData[URLFieldsType.UF_PORT].len;
    uint v = 0;
    foreach (const ubyte digit; data[portStart .. portEnd]) {
      v = v * 10 + (digit - '0');
      /* Ports must fit into 16 bits */
      if (v > ushort.max) return false;
    }

    u.port = cast(ushort)v;
  }

  return true;
}
154 
155 package:
156 
/**
 * Single transition of the host state machine: given the current state `s`
 * and the next input byte `ch`, returns the state to move to, or
 * s_http_host_dead when `ch` is not acceptable in state `s`.
 */
HTTPParserHostState parserHostChar(HTTPParserHostState s, ubyte ch) @nogc nothrow {
  with (HTTPParserHostState)
  {
    /* userinfo: runs until the '@' that introduces the host */
    if (s == s_http_userinfo || s == s_http_userinfo_start)
    {
      if (ch == '@')
        return s_http_host_start;
      return IS_USERINFO_CHAR2(ch) ? s_http_userinfo : s_http_host_dead;
    }

    /* first byte of the host: either a bracketed IPv6 literal or a name */
    if (s == s_http_host_start)
    {
      if (ch == '[')
        return s_http_host_v6_start;
      return IS_HOST_CHAR(ch) ? s_http_host : s_http_host_dead;
    }

    /* inside a host name, or just after ']' : only ':' may follow
     * (a host name may of course also simply continue) */
    if (s == s_http_host || s == s_http_host_v6_end)
    {
      if (s == s_http_host && IS_HOST_CHAR(ch))
        return s_http_host;
      return (ch == ':') ? s_http_host_port_start : s_http_host_dead;
    }

    /* IPv6 literal body; ']' and the '%' zone introducer are only valid
     * once at least one address byte has been seen */
    if (s == s_http_host_v6 || s == s_http_host_v6_start)
    {
      const bool started = (s == s_http_host_v6);
      if (started && ch == ']')
        return s_http_host_v6_end;
      if (IS_HEX(ch) || ch == ':' || ch == '.')
        return s_http_host_v6;
      if (started && ch == '%')
        return s_http_host_v6_zone_start;
      return s_http_host_dead;
    }

    /* IPv6 zone id; ']' is only valid once the zone is non-empty */
    if (s == s_http_host_v6_zone || s == s_http_host_v6_zone_start)
    {
      if (s == s_http_host_v6_zone && ch == ']')
        return s_http_host_v6_end;
      /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
      const bool zoneChar = IS_ALPHANUM(ch) || ch == '%' || ch == '.'
                         || ch == '-' || ch == '_' || ch == '~';
      return zoneChar ? s_http_host_v6_zone : s_http_host_dead;
    }

    /* port: decimal digits only */
    if (s == s_http_host_port || s == s_http_host_port_start)
    {
      if (mixin(IS_NUM("ch")))
        return s_http_host_port;
      return s_http_host_dead;
    }
  }
  /* any other state has no valid transition */
  return HTTPParserHostState.s_http_host_dead;
}
236 
/**
 * Second pass over the raw server part recorded in u.fieldData[UF_HOST]:
 * splits it into userinfo, host and port, updating `u` accordingly.
 *
 * Params:
 *   data     = the buffer the offsets in `u` refer to
 *   u        = parse result; UF_HOST must already be set and cover the
 *              whole server part
 *   found_at = true when the server part contains an '@', i.e. starts
 *              with userinfo
 *
 * Returns: false if a byte is rejected by parserHostChar() or the server
 * part ends in an incomplete state, true otherwise.
 */
bool parserHost(const ubyte[] data, ref ParserdUrl u, bool found_at) @nogc nothrow {
  alias UF = URLFieldsType;
  alias HS = HTTPParserHostState;

  /* End of the server part, captured before the host length is reset */
  const size_t stop = u.fieldData[UF.UF_HOST].off + u.fieldData[UF.UF_HOST].len;

  assert(u.fieldSet & (1 << UF.UF_HOST));

  u.fieldData[UF.UF_HOST].len = 0;

  HS state = found_at ? HS.s_http_userinfo_start : HS.s_http_host_start;

  for (size_t pos = u.fieldData[UF.UF_HOST].off; pos < stop; ++pos) {
    const HS next = parserHostChar(state, data[pos]);

    if (next == HS.s_http_host_dead) {
      return false;
    }

    /* `state != next` means this byte is the first one of the new run */
    const bool entering = (state != next);

    if (next == HS.s_http_host || next == HS.s_http_host_v6) {
      if (entering) {
        u.fieldData[UF.UF_HOST].off = cast(ushort)pos;
      }
      u.fieldData[UF.UF_HOST].len++;
    }
    else if (next == HS.s_http_host_v6_zone_start || next == HS.s_http_host_v6_zone) {
      /* the zone id (including its '%') is counted as part of the host */
      u.fieldData[UF.UF_HOST].len++;
    }
    else if (next == HS.s_http_host_port) {
      if (entering) {
        u.fieldData[UF.UF_PORT].off = cast(ushort)pos;
        u.fieldData[UF.UF_PORT].len = 0;
        u.fieldSet |= (1 << UF.UF_PORT);
      }
      u.fieldData[UF.UF_PORT].len++;
    }
    else if (next == HS.s_http_userinfo) {
      if (entering) {
        u.fieldData[UF.UF_USERINFO].off = cast(ushort)pos;
        u.fieldData[UF.UF_USERINFO].len = 0;
        u.fieldSet |= (1 << UF.UF_USERINFO);
      }
      u.fieldData[UF.UF_USERINFO].len++;
    }

    state = next;
  }

  /* Make sure we don't end somewhere unexpected */
  const bool incomplete =
       state == HS.s_http_host_start
    || state == HS.s_http_host_v6_start
    || state == HS.s_http_host_v6
    || state == HS.s_http_host_v6_zone_start
    || state == HS.s_http_host_v6_zone
    || state == HS.s_http_host_port_start
    || state == HS.s_http_userinfo
    || state == HS.s_http_userinfo_start;

  return !incomplete;
}
318 
/**
 * Single transition of the URL state machine: given the current state `s`
 * and the next input byte `ch`, returns the state to move to, or
 * HTTPParserState.s_dead when `ch` is not acceptable in state `s`.
 *
 * Space, CR and LF are never accepted. When the HTTP_PARSER_STRICT version
 * is enabled, TAB and form feed are rejected as well.
 */
HTTPParserState parseURLchar(HTTPParserState s, ubyte ch) @nogc nothrow
{
    if (ch == ' ' || ch == '\r' || ch == '\n')
        return HTTPParserState.s_dead;

    version (HTTP_PARSER_STRICT)
    {
        /* Must be fully qualified: the enum members are only brought into
         * scope by the `with` on the switch below. */
        if (ch == '\t' || ch == '\f')
            return HTTPParserState.s_dead;
    }

    switch (s) with (HTTPParserState)
    {
    case s_req_spaces_before_url:
        /* Proxied requests are followed by scheme of an absolute URI (alpha).
         * All methods except CONNECT are followed by '/' or '*'.
         */
        if (ch == '/' || ch == '*')
            return s_req_path;

        if (mixin(IS_ALPHA("ch")))
            return s_req_schema;
        break;

    case s_req_schema:
        if (mixin(IS_ALPHA("ch")))
            return s;

        if (ch == ':')
            return s_req_schema_slash;
        break;

    case s_req_schema_slash:
        if (ch == '/')
            return s_req_schema_slash_slash;
        break;

    case s_req_schema_slash_slash:
        if (ch == '/')
            return s_req_server_start;
        break;

    case s_req_server_with_at:
        /* a second '@' in the server part is an error */
        if (ch == '@')
            return s_dead;
        goto case;
        /* FALLTHROUGH */
    case s_req_server_start:
    case s_req_server:
        if (ch == '/')
            return s_req_path;

        if (ch == '?')
            return s_req_query_string_start;

        if (ch == '@')
            return s_req_server_with_at;

        /* the detailed host grammar is checked later by parserHost() */
        if (IS_USERINFO_CHAR2(ch) || ch == '[' || ch == ']')
            return s_req_server;
        break;

    case s_req_path:
        if (mixin(IS_URL_CHAR("ch")))
            return s;

        if (ch == '?')
            return s_req_query_string_start;

        if (ch == '#')
            return s_req_fragment_start;
        break;

    case s_req_query_string_start:
    case s_req_query_string:
        if (mixin(IS_URL_CHAR("ch")))
            return s_req_query_string;

        /* allow extra '?' in query string */
        if (ch == '?')
            return s_req_query_string;

        if (ch == '#')
            return s_req_fragment_start;
        break;

    case s_req_fragment_start:
        if (mixin(IS_URL_CHAR("ch")))
            return s_req_fragment;

        if (ch == '?')
            return s_req_fragment;

        /* repeated '#' keeps us at the start of the fragment */
        if (ch == '#')
            return s;
        break;

    case s_req_fragment:
        if (mixin(IS_URL_CHAR("ch")))
            return s;

        if (ch == '?' || ch == '#')
            return s;
        break;

    default:
        break;
    }
    /* We should never fall out of the switch above unless there's an error */
    return HTTPParserState.s_dead;
}
460 
/// True when `c` is an ASCII hexadecimal digit (0-9, a-f, A-F).
pragma(inline, true) bool IS_HEX(ubyte c) nothrow @nogc
{
    if (mixin(IS_NUM("c")))
        return true;

    // Setting bit 0x20 folds 'A'..'F' onto 'a'..'f'.
    const ubyte folded = c | 0x20;
    return folded >= 'a' && folded <= 'f';
}
467 
/// True when `c` may appear in a host name: ASCII letter, digit, '.' or '-'.
pragma(inline, true) bool IS_HOST_CHAR(ubyte c) nothrow @nogc {
    if (c == '.' || c == '-')
        return true;
    return IS_ALPHANUM(c);
}
471 
/// True when `c` is an ASCII letter or an ASCII decimal digit.
pragma(inline, true) bool IS_ALPHANUM(ubyte c) nothrow @nogc {
    const bool isLetter = mixin(IS_ALPHA("c"));
    if (isLetter)
        return true;
    const bool isDigit = mixin(IS_NUM("c"));
    return isDigit;
}
477 
/// True when `c` is accepted in the userinfo part of a URL: an ASCII letter
/// or digit, or one of  % ; : & = + $ ,  - _ . ! ~ * ' ( )
pragma(inline, true) bool IS_USERINFO_CHAR2(ubyte c) nothrow @nogc
{
    switch (c)
    {
    // reserved characters allowed in userinfo, plus '%' for escapes
    case '%', ';', ':', '&', '=', '+', '$', ',':
    // "mark" characters
    case '-', '_', '.', '!', '~', '*', '\'', '(', ')':
        return true;

    default:
        return IS_ALPHANUM(c);
    }
}
486 
/// Throws Http1xParserExcetion(HPE_STRICT) when `istrue` is true; does
/// nothing otherwise.
pragma(inline, true)
void STRICT_CHECK(bool istrue)
{
    if (!istrue)
        return;

    throw new Http1xParserExcetion(HTTPParserErrno.HPE_STRICT);
}
493 //	string IS_MARK(string c) { return "(" ~ c ~ " == '-' || " ~ c ~ " == '_' || "~ c ~ " == '.' || " ~ c ~ " == '!' || " ~ c ~ " == '~' ||  " ~ c ~ " == '*' ||  " ~ c ~ " == '\'' || " ~ c ~ " == '(' || " ~ c ~ " == ')')";}
/// Builds the source text of a boolean expression, for use with `mixin`,
/// that tests whether the expression named by `c` is an ASCII digit.
string IS_NUM(string c)
{
    string expr = "(";
    expr ~= c;
    expr ~= " >= '0' &&  ";
    expr ~= c;
    expr ~= "  <= '9')";
    return expr;
}
498 
/// Builds the source text of a boolean expression, for use with `mixin`,
/// that tests whether the expression named by `c` is an ASCII letter
/// (the value is OR-ed with 0x20 so both cases are matched).
string IS_ALPHA(string c)
{
    immutable atLeastA = "(" ~ c ~ "| 0x20) >= 'a'";
    immutable atMostZ  = "(" ~ c ~ " | 0x20) <= 'z'";
    return "(" ~ atLeastA ~ " && " ~ atMostZ ~ ")";
}
503 
/// Builds the source text of a boolean expression, for use with `mixin`,
/// that looks up the expression named by `c` in the `normal_url_char`
/// bitmap: byte index is `c >> 3`, bit index is `c & 7`.
///
/// `normal_url_char` must be visible at the place where the result is mixed
/// in. Both uses of `c` are parenthesized so that a compound argument
/// (e.g. "a | b") is evaluated as a whole before the shift/mask is applied.
string IS_URL_CHAR(string c)
{
    immutable value = "cast(uint) (" ~ c ~ ")";
    return "(!!(cast(uint) (normal_url_char[" ~ value ~ " >> 3]) & (1 << ("
        ~ value ~ " & 7))))";
}
510 
/// Source text (for `mixin`) of the expression selecting the state to start
/// the next message in: the request/response start state while the
/// connection is kept alive, the dead state otherwise.
enum NEW_MESSAGE = "httpShouldKeepAlive()"
    ~ " ? (type == HTTPType.REQUEST ? HTTPParserState.s_start_req : HTTPParserState.s_start_res)"
    ~ " : HTTPParserState.s_dead";
/// Builds the source text (for `mixin`) of a block that invokes the
/// notification handler `_on<code>` if it is set and, when `handleIng` is
/// false afterwards, throws Http1xParserStopExcetion with HPE_CB_<code> and
/// position `p + 1`.
string CALLBACK_NOTIFY(string code)
{
    immutable handler = "_on" ~ code;
    return " {if (" ~ handler ~ " !is null){
               " ~ handler ~ "(this);  
               if(!handleIng)
	                throw new Http1xParserStopExcetion(HTTPParserErrno.HPE_CB_" ~ code ~ ", p + 1);
                } }";
}
521 
/// Builds the source text (for `mixin`) of a block that invokes the
/// notification handler `_on<code>` if it is set and, when `handleIng` is
/// false afterwards, throws Http1xParserStopExcetion with HPE_CB_<code> and
/// position `p` (the current byte is not counted as consumed).
///
/// The handler is tested with `!is null`, matching the other CALLBACK_*
/// generators in this module.
string CALLBACK_NOTIFY_NOADVANCE(string code)
{
    string _s = " {if (_on" ~ code ~ " !is null){
               _on" ~ code ~ "(this);
               if(!handleIng)
                    throw new Http1xParserStopExcetion(HTTPParserErrno.HPE_CB_" ~ code ~ ", p);
                } }";
    return _s;
}
531 
/// Builds the source text (for `mixin`) of a block that, when the mark
/// `m<code>Mark` is set and the handler `_on<code>` is present, passes
/// `data[m<code>Mark..p]` (if non-empty) to the handler with `true` as last
/// argument and throws Http1xParserStopExcetion (HPE_CB_<code>, `p + 1`)
/// when `handleIng` is false afterwards. The mark is reset to size_t.max in
/// every case.
string CALLBACK_DATA(string code)
{
    immutable mark = "m" ~ code ~ "Mark";
    immutable handler = "_on" ~ code;
    return "{ if( " ~ mark ~ " != size_t.max && " ~ handler ~ " !is null){
                ulong len = (p - " ~ mark ~ ") ;
                
                if(len > 0) {  
               /* writeln(\"CALLBACK_DATA at  \",__LINE__, \"  " ~ code ~ "\");*/
                ubyte[]  _data =  data[" ~ mark ~ "..p];
                " ~ handler ~ "(this,_data,true);
                 if(!handleIng)
	                throw new Http1xParserStopExcetion(HTTPParserErrno.HPE_CB_" ~ code ~ ", p + 1);
                } } " ~ mark ~ " = size_t.max;}";
}
549 
/// Builds the source text (for `mixin`) of a block that, when the mark
/// `m<code>Mark` is set and the handler `_on<code>` is present, passes
/// `data[m<code>Mark..p]` (if non-empty) to the handler with `false` as last
/// argument and throws Http1xParserStopExcetion (HPE_CB_<code>, `p`) when
/// `handleIng` is false afterwards. The mark is reset to size_t.max in
/// every case.
string CALLBACK_DATA_NOADVANCE(string code)
{
    immutable mark = "m" ~ code ~ "Mark";
    immutable handler = "_on" ~ code;
    return "{ if(" ~ mark ~ " != size_t.max && " ~ handler ~ " !is null){
                ulong len = (p - " ~ mark ~ ") ;
                if(len > 0) {  
                 /*writeln(\"CALLBACK_DATA_NOADVANCE at  \",__LINE__, \"  " ~ code ~ "\");*/
                ubyte[]  _data = data[" ~ mark ~ "..p];
                " ~ handler ~ "(this,_data,false);
                 if(!handleIng)
	                throw new Http1xParserStopExcetion(HTTPParserErrno.HPE_CB_" ~ code ~ ", p);
                }}" ~ mark ~ " = size_t.max;}";
}
565 
// Parses a URL carrying every component and checks each extracted field.
@nogc nothrow unittest{
    alias UF = URLFieldsType;

    string sample = "http://aa:werwer@www.hostname.co:8086/test?a=b#dadsas";
    ParserdUrl parsed;
    assert(httpParserURL(sample, parsed));

    // schema and host are reported as present
    assert(parsed.hasField(UF.UF_SCHEMA));
    assert(parsed.hasField(UF.UF_HOST));

    // host excludes the userinfo and the port
    assert(parsed.getField(sample, UF.UF_HOST) == "www.hostname.co");
    assert(parsed.port == 8086);

    // fragment, query and userinfo are sliced without their delimiters
    assert(parsed.hasField(UF.UF_FRAGMENT));
    assert(parsed.getField(sample, UF.UF_FRAGMENT) == "dadsas");
    assert(parsed.getField(sample, UF.UF_QUERY) == "a=b");
    assert(parsed.getField(sample, UF.UF_USERINFO) == "aa:werwer");
}