]> git.saurik.com Git - apple/security.git/blob - SecurityTests/clxutils/urlPageGrab/urlPageGrab.cpp
Security-57031.10.10.tar.gz
[apple/security.git] / SecurityTests / clxutils / urlPageGrab / urlPageGrab.cpp
1 /*
2 * urlPageGrab - download a page and all of the image sources referenced on
3 * that page.
4 */
5 #include <stdlib.h>
6 #include <stdio.h>
7 #include <security_utilities/threading.h>
8 #include <Carbon/Carbon.h>
9 #include <time.h>
10 #include <string.h>
11 #include <pthread.h>
12 #include "cfSimpleGet.h"
13
/* capacity, including the terminating NUL, of ThreadParams.path */
#define MAX_PATH_LEN    256
/* capacity, including the terminating NUL, of the URL buffer built in fetchUrl() */
#define MAX_URL_LEN     1024
/* number of entries allocated for the ThreadParams array in fetchImages() */
#define MAX_THREADS     100

/*
 * Debug tracing. Set DEBUG_PRINT to 1 to route dprintf() to printf(); when it
 * is 0 the call, including its arguments, expands to nothing.
 * Uses the standard variadic macro form rather than the GNU-only "args..."
 * spelling.
 */
#define DEBUG_PRINT     0
#if DEBUG_PRINT
#define dprintf(...)    printf(__VA_ARGS__)
#else
#define dprintf(...)
#endif
24
25 /*
26 * Until Radar 2731877 (CFURLCreateWithBytes chokes on '|') is fixed, we skip the
27 * fetching of anything with that character.
28 */
29 #define SKIP_MULTI_QUERIES 1
30
/*
 * Hosts known NOT to support SSL. Images referenced from these hosts have
 * to be fetched via http: even when the enclosing page came via https:.
 * The table is terminated by a NULL entry.
 */
static const char *nonSslSites[] = {
    "cover2.cduniverse.com",
    "a248.e.akamai.net",
    NULL
};

/*
 * Look up host in nonSslSites using an exact, case-sensitive comparison.
 * Returns 1 when the host is listed, 0 otherwise.
 */
static int isHostNonSsl(const char *host)
{
    for(size_t siteDex = 0; nonSslSites[siteDex] != NULL; siteDex++) {
        if(strcmp(nonSslSites[siteDex], host) == 0) {
            return 1;
        }
    }
    return 0;
}
54
55 /*
56 * Used to force single-threaded access to URLSimpleDownload.
57 */
58 static Mutex urlLock;
59
60 static void urlThreadLock()
61 {
62 urlLock.lock();
63 }
64
65 static void urlThreadUnlock()
66 {
67 urlLock.unlock();
68 }
69
70 /*
71 * Parameters for one thread, which fetches the contents of one URL (in this
72 * case, an image source).
73 */
74 typedef struct {
75 /* in */
76 const char *host;
77 char path[MAX_PATH_LEN];
78 bool isSsl;
79 bool useCfNet;
80 int singleThread;
81 int quiet;
82 pthread_t pthr;
83 unsigned threadNum;
84
85 /* out */
86 OSStatus ortn;
87 unsigned bytesRead;
88 } ThreadParams;
89
/*
 * Print the command-line synopsis, using argv[0] as the program name, and
 * terminate the process with exit status 1. Does not return.
 */
static void usage(char **argv)
{
    static const char *const optionHelp[] = {
        "Options:\n",
        " u (URLAccess; default is CFNetwork)\n",
        " s connect via SSL\n",
        " t single thread access to URLSimpleDownload\n",
        " q quiet\n",
    };
    const size_t numLines = sizeof(optionHelp) / sizeof(optionHelp[0]);

    printf("%s hostname path [options]\n", argv[0]);
    for(size_t lineDex = 0; lineDex < numLines; lineDex++) {
        fputs(optionHelp[lineDex], stdout);
    }
    exit(1);
}
100
/*
 * Write the URL formed from host and path to stdout, with no trailing
 * newline. The scheme is https when isSsl is nonzero, http otherwise.
 */
static void printUrl(
    const char  *host,
    const char  *path,
    int         isSsl)
{
    const char *schemeName = isSsl ? "https" : "http";

    printf("%s://%s%s", schemeName, host, path);
}
113
114 /*
115 * Given a hostname, path, and SSL flag, fetch the data, optionally
116 * returning it in the form of a CFDataRef.
117 */
118 static OSStatus fetchUrl(
119 const char *host,
120 const char *path,
121 bool isSsl,
122 bool useCfNet,
123 int singleThread,
124 unsigned *bytesRead, // RETURNED, always
125 CFDataRef *cfData) // optional, RETURNED
126 {
127 char url[MAX_URL_LEN];
128 char *scheme;
129 OSStatus ortn;
130
131 *bytesRead = 0;
132 if(isSsl) {
133 scheme = "https://";
134 }
135 else {
136 scheme = "http://";
137 }
138 sprintf(url, "%s%s%s", scheme, host, path);
139 if(singleThread) {
140 urlThreadLock();
141 }
142 if(useCfNet) {
143 CFDataRef cd = cfSimpleGet(url);
144 if(cd) {
145 /* always report this */
146 *bytesRead = CFDataGetLength(cd);
147 if(cfData) {
148 /* optional */
149 *cfData = cd;
150 }
151 else {
152 /* caller doesn't want */
153 CFRelease(cd);
154 }
155 ortn = noErr;
156 }
157 else {
158 printf("implied ioErr from cfnet\n");
159 ortn = ioErr;
160 }
161 }
162 else {
163 /* original URLAccess mechanism */
164
165 Handle h = NewHandle(0);
166 ortn = URLSimpleDownload(url,
167 NULL,
168 h,
169 0, //kURLDisplayProgressFlag,
170 NULL, //eventCallback,
171 NULL); // userContext
172 *bytesRead = GetHandleSize(h);
173 if((cfData != NULL) && (ortn == noErr)) {
174 CFDataRef cd = CFDataCreate(NULL, (UInt8 *)*h, *bytesRead);
175 *cfData = cd;
176 }
177 if(ortn) {
178 printf("%d returned from URLSimpleDownload\n", (int)ortn);
179 }
180 DisposeHandle(h);
181 }
182 if(singleThread) {
183 urlThreadUnlock();
184 }
185 dprintf("...read %d bytes from %s\n", (int)(*bytesRead), url);
186 return ortn;
187 }
188
189 /*
190 * Main pthread body, fetches source for one image.
191 */
192 static void *imageThread(void *arg)
193 {
194 ThreadParams *params = (ThreadParams *)arg;
195
196 params->ortn = fetchUrl(params->host,
197 params->path,
198 params->isSsl,
199 params->useCfNet,
200 params->singleThread,
201 &params->bytesRead,
202 NULL); // don't want the data
203 pthread_exit(NULL);
204 /* NOT REACHED */
205 return NULL;
206 }
207
208 /*
209 * Given a Handle supposedly associated with a page of HTML, do an el-cheapo parse
210 * of the HTML looking for IMG SRC tags. Fork off a thread for each image. Wait for
211 * each thread to complete. Returns total number of errors of any kind found.
212 */
213 static int fetchImages(
214 CFDataRef cfData,
215 const char *host,
216 const char *origPath,
217 int isSsl,
218 bool useCfNet,
219 int singleThread,
220 int quiet)
221 {
222 char *mungedHtml;
223 Size mungedLen;
224 char *cp;
225 char *imageNameStart;
226 char *imageNameEnd;
227 unsigned imageNameLen;
228 ThreadParams *params = NULL; // big array
229 ThreadParams *thisThread;
230 unsigned threadDex;
231 int prtn;
232 unsigned numThreads = 0; // valid entries in params[]
233 int totalErrors = 0;
234 char *basePath = NULL;
235
236 /*
237 * If the original path ends in '/', use it as basePath.
238 * Else strip off trailing component.
239 */
240 unsigned origPathLen = strlen(origPath);
241 basePath = strdup(origPath);
242 if(origPath[origPathLen - 1] != '/') {
243 /* trim */
244 unsigned basePathLen = origPathLen;
245 for(char *cp=basePath + origPathLen - 1; cp > basePath; cp--) {
246 basePathLen--;
247 if(*cp == '/') {
248 /* found the last one - string ends here */
249 cp[1] = '\0';
250 break;
251 }
252 }
253 }
254 /* re-alloc the raw source as a NULL-terminated C string for easy str-based
255 * parsing */
256 mungedLen = CFDataGetLength(cfData);
257 if(mungedLen == 0) {
258 printf("***size() of main page is zero!\n");
259 return 0;
260 }
261 mungedLen++;
262 mungedHtml = (char *)malloc(mungedLen);
263 memmove(mungedHtml, CFDataGetBytePtr(cfData), mungedLen-1);
264 mungedHtml[mungedLen - 1] = '\0';
265
266 /* create a ThreadParams array big enough for most purposes */
267 params = (ThreadParams *)malloc(sizeof(ThreadParams) * MAX_THREADS);
268
269 /* start of el cheapo parse. Upper-case all "img src" into "IMG SRC". */
270 for(;;) {
271 cp = strstr(mungedHtml, "img src");
272 if(cp == NULL) {
273 break;
274 }
275 memmove(cp, "IMG SRC", 7);
276 cp += 7;
277 }
278
279 /* convert all '\' to '/' - some URLs (e.g. from cduniverse.com) out there
280 * use the backslash, but CF's URL can't deal with it */
281 for(;;) {
282 cp = strchr(mungedHtml, '\\');
283 if(cp == NULL) {
284 break;
285 }
286 *cp = '/';
287 }
288
289 /* search for "IMG SRC", fork off thread to fetch each one's image */
290 cp = mungedHtml;
291 for(;;) {
292 cp = strstr(cp, "IMG SRC=");
293 if(cp == NULL) {
294 break;
295 }
296
297 /* get ptr to start of image file name */
298 cp += 8;
299 if(*cp == '"') {
300 /* e.g., <IMG SRC="foobar.gif"> */
301 imageNameStart = ++cp;
302 imageNameEnd = strchr(imageNameStart, '"');
303 }
304 else {
305 /* e.g., <IMG SRC=foobar.gif> */
306 char *nextSpace;
307 imageNameStart = cp;
308 imageNameEnd = strchr(imageNameStart, '>');
309 nextSpace = strchr(imageNameStart, ' ');
310 if((imageNameEnd == NULL) || (imageNameEnd > nextSpace)) {
311 imageNameEnd = nextSpace;
312 }
313 }
314 if(imageNameEnd == NULL) {
315 printf("***Bad HTML - missing quote/bracket after image file name\n");
316 continue;
317 }
318 cp = imageNameEnd;
319
320 /* fill in a ThreadParams */
321 thisThread = &params[numThreads];
322 thisThread->host = host;
323 thisThread->isSsl = isSsl;
324 thisThread->useCfNet = useCfNet;
325 thisThread->singleThread = singleThread;
326 thisThread->threadNum = numThreads;
327 thisThread->quiet = quiet;
328 thisThread->ortn = -1;
329
330 /* path may be relative to basePath or a fully qualified URL */
331 imageNameLen = imageNameEnd - imageNameStart;
332 if(imageNameStart[0] == '/') {
333 /* absolute path, use as is */
334 memmove(thisThread->path, imageNameStart, imageNameLen);
335 thisThread->path[imageNameLen] = '\0';
336 }
337 else if(strncmp(imageNameStart, "http", 4) == 0) {
338 /* skip "http://" or "https://"; host name goes from after
339 * tha until next '/' */
340 const char *hostStart = strstr(imageNameStart, "//");
341 if((hostStart == NULL) || (hostStart > (imageNameEnd-2))) {
342 /* hmmm...punt */
343 continue;
344 }
345 hostStart += 2;
346 const char *hostEnd = strchr(hostStart, '/');
347 if(hostEnd >= imageNameEnd) {
348 /* punt */
349 continue;
350 }
351 /* we're gonna leak this host string for now */
352 unsigned hostLen = hostEnd - hostStart;
353 char *hostStr = (char *)malloc(hostLen + 1);
354 memmove(hostStr, hostStart, hostLen);
355 hostStr[hostLen] = '\0';
356 thisThread->host = (const char *)hostStr;
357 /* remainder is path */
358 /* FIXME - may have to deal with port number, currently in host string */
359 memmove(thisThread->path, hostEnd, imageNameEnd-hostEnd);
360 thisThread->path[imageNameEnd-hostEnd] = '\0';
361
362 if(isSsl && isHostNonSsl(hostStr)) {
363 /* some sites, e.g., cdu1.cduniverse.com, reference images
364 * which are NOT available via SSL */
365 thisThread->isSsl = 0;
366 }
367 }
368 else {
369 /* path := basePath | relativePath */
370 unsigned basePathLen = strlen(basePath);
371 memmove(thisThread->path, basePath, basePathLen);
372 memmove(thisThread->path + basePathLen, imageNameStart, imageNameLen);
373 thisThread->path[basePathLen + imageNameLen] = '\0';
374 }
375
376 #if SKIP_MULTI_QUERIES
377 if(strchr(thisThread->path, '|')) {
378 /* CFURLCreateWithBytes will choke, so will URLSimpleDownload */
379 continue;
380 }
381 #endif
382
383 /* fork off a thread to fetch it */
384 if(!quiet) {
385 printf(" ");
386 printUrl(thisThread->host, thisThread->path, thisThread->isSsl);
387 printf(": thread %u : forking imageThread\n",
388 thisThread->threadNum);
389 }
390 prtn = pthread_create(&thisThread->pthr,
391 NULL,
392 imageThread,
393 thisThread);
394 if(prtn) {
395 printf("***Error creating pthread (%d)\n", prtn);
396 totalErrors++;
397 break;
398 }
399 numThreads++;
400 if(numThreads == MAX_THREADS) {
401 /* OK, that's enough */
402 break;
403 }
404 }
405 free(mungedHtml);
406
407 /* wait for each thread to complete */
408 if(!quiet) {
409 printf(" waiting for image threads to complete...\n");
410 }
411 for(threadDex=0; threadDex<numThreads; threadDex++) {
412 void *status;
413 thisThread = &params[threadDex];
414 prtn = pthread_join(thisThread->pthr, &status);
415 if(prtn) {
416 printf("***pthread_join returned %d, aborting\n", prtn);
417 totalErrors++;
418 break;
419 }
420 if(!quiet || thisThread->ortn) {
421 printf(" ");
422 printUrl(thisThread->host, thisThread->path, thisThread->isSsl);
423 printf(": thread %u : fetch result %d, read %d bytes\n",
424 thisThread->threadNum,
425 (int)thisThread->ortn, thisThread->bytesRead);
426 }
427 if(thisThread->ortn) {
428 totalErrors++;
429 }
430 }
431 free(params);
432 return totalErrors;
433 }
434
435 int main(int argc, char **argv)
436 {
437 bool isSsl = false;
438 bool useCfNet = true;
439 int singleThread = 0;
440 int quiet = 0;
441 OSStatus ortn;
442 int arg;
443 CFDataRef cfData;
444 char *host;
445 char *path;
446 int ourRtn = 0;
447
448 if(argc < 3) {
449 usage(argv);
450 }
451 host = argv[1];
452 path = argv[2];
453 for(arg=3; arg<argc; arg++) {
454 switch(argv[arg][0]) {
455 case 's':
456 isSsl = true;
457 break;
458 case 'u':
459 useCfNet = false;
460 break;
461 case 't':
462 singleThread = 1;
463 break;
464 case 'q':
465 quiet = 1;
466 break;
467 default:
468 usage(argv);
469 }
470 }
471
472 /* first get the main body of html text */
473 printf("...fetching page at ");
474 printUrl(host, path, isSsl);
475 printf("\n");
476 unsigned bytesRead;
477 ortn = fetchUrl(host, path, isSsl, useCfNet, singleThread, &bytesRead, &cfData);
478 if(ortn) {
479 printf("***Error %d fetching from host %s path %s\n", (int)ortn, host, path);
480 exit(1);
481 }
482
483 /* parse the HTML, forking off a thread for each IMG SRC found */
484 ourRtn = fetchImages(cfData, host, path, isSsl, useCfNet, singleThread, quiet);
485 CFRelease(cfData);
486 if(ourRtn) {
487 printf("===%s exiting with %d %s for host %s\n", argv[0], ourRtn,
488 (ourRtn > 1) ? "errors" : "error", host);
489 }
490 return ourRtn;
491 }