]>
git.saurik.com Git - apple/security.git/blob - SecurityTests/clxutils/urlPageGrab/urlPageGrab.cpp
2 * urlPageGrab - download a page and all of the image sources referenced on
7 #include <security_utilities/threading.h>
8 #include <Carbon/Carbon.h>
12 #include "cfSimpleGet.h"
14 #define MAX_PATH_LEN 256
15 #define MAX_URL_LEN 1024
16 #define MAX_THREADS 100
20 #define dprintf(args...) printf(args)
22 #define dprintf(args...)
26 * Until Radar 2731877 (CFURLCreateWithBytes chokes on '|') is fixed, we skip the
27 * fetching of anything with that character.
29 #define SKIP_MULTI_QUERIES 1
32 * List of servers known to NOT support SSL; if images from these servers are
33 * needed, get them via http:.
35 static const char *nonSslSites
[] = {
36 "cover2.cduniverse.com",
41 /* return nonzero if specified host is in nonSslSites */
42 static int isHostNonSsl(
45 const char **nss
= nonSslSites
;
47 if(!strcmp(*nss
, host
)) {
56 * Used to force single-threaded access to URLSimpleDownload.
60 static void urlThreadLock()
65 static void urlThreadUnlock()
71 * Parameters for one thread, which fetches the contents of one URL (in this
72 * case, an image source).
77 char path
[MAX_PATH_LEN
];
90 static void usage(char **argv
)
92 printf("%s hostname path [options]\n", argv
[0]);
94 printf(" u (URLAccess; default is CFNetwork)\n");
95 printf(" s connect via SSL\n");
96 printf(" t single thread access to URLSimpleDownload\n");
101 static void printUrl(
107 printf("https://%s%s", host
, path
);
110 printf("http://%s%s", host
, path
);
115 * Given a hostname, path, and SSL flag, fetch the data, optionally
116 * returning it in the form of a CFDataRef.
118 static OSStatus
fetchUrl(
124 unsigned *bytesRead
, // RETURNED, always
125 CFDataRef
*cfData
) // optional, RETURNED
127 char url
[MAX_URL_LEN
];
138 sprintf(url
, "%s%s%s", scheme
, host
, path
);
143 CFDataRef cd
= cfSimpleGet(url
);
145 /* always report this */
146 *bytesRead
= CFDataGetLength(cd
);
152 /* caller doesn't want */
158 printf("implied ioErr from cfnet\n");
163 /* original URLAccess mechanism */
165 Handle h
= NewHandle(0);
166 ortn
= URLSimpleDownload(url
,
169 0, //kURLDisplayProgressFlag,
170 NULL
, //eventCallback,
171 NULL
); // userContext
172 *bytesRead
= GetHandleSize(h
);
173 if((cfData
!= NULL
) && (ortn
== noErr
)) {
174 CFDataRef cd
= CFDataCreate(NULL
, (UInt8
*)*h
, *bytesRead
);
178 printf("%d returned from URLSimpleDownload\n", (int)ortn
);
185 dprintf("...read %d bytes from %s\n", (int)(*bytesRead
), url
);
190 * Main pthread body, fetches source for one image.
192 static void *imageThread(void *arg
)
194 ThreadParams
*params
= (ThreadParams
*)arg
;
196 params
->ortn
= fetchUrl(params
->host
,
200 params
->singleThread
,
202 NULL
); // don't want the data
209 * Given a Handle supposedly associated with a page of HTML, do an el-cheapo parse
210 * of the HTML looking for IMG SRC tags. Fork off a thread for each image. Wait for
211 * each thread to complete. Returns total number of errors of any kind found.
213 static int fetchImages(
216 const char *origPath
,
225 char *imageNameStart
;
227 unsigned imageNameLen
;
228 ThreadParams
*params
= NULL
; // big array
229 ThreadParams
*thisThread
;
232 unsigned numThreads
= 0; // valid entries in params[]
234 char *basePath
= NULL
;
237 * If the original path ends in '/', use it as basePath.
238 * Else strip off trailing component.
240 unsigned origPathLen
= strlen(origPath
);
241 basePath
= strdup(origPath
);
242 if(origPath
[origPathLen
- 1] != '/') {
244 unsigned basePathLen
= origPathLen
;
245 for(char *cp
=basePath
+ origPathLen
- 1; cp
> basePath
; cp
--) {
248 /* found the last one - string ends here */
254 /* re-alloc the raw source as a NULL-terminated C string for easy str-based
256 mungedLen
= CFDataGetLength(cfData
);
258 printf("***size() of main page is zero!\n");
262 mungedHtml
= (char *)malloc(mungedLen
);
263 memmove(mungedHtml
, CFDataGetBytePtr(cfData
), mungedLen
-1);
264 mungedHtml
[mungedLen
- 1] = '\0';
266 /* create a ThreadParams array big enough for most purposes */
267 params
= (ThreadParams
*)malloc(sizeof(ThreadParams
) * MAX_THREADS
);
269 /* start of el cheapo parse. Upper-case all "img src" into "IMG SRC". */
271 cp
= strstr(mungedHtml
, "img src");
275 memmove(cp
, "IMG SRC", 7);
279 /* convert all '\' to '/' - some URLs (e.g. from cduniverse.com) out there
280 * use the backslash, but CF's URL can't deal with it */
282 cp
= strchr(mungedHtml
, '\\');
289 /* search for "IMG SRC", fork off thread to fetch each one's image */
292 cp
= strstr(cp
, "IMG SRC=");
297 /* get ptr to start of image file name */
300 /* e.g., <IMG SRC="foobar.gif"> */
301 imageNameStart
= ++cp
;
302 imageNameEnd
= strchr(imageNameStart
, '"');
305 /* e.g., <IMG SRC=foobar.gif> */
308 imageNameEnd
= strchr(imageNameStart
, '>');
309 nextSpace
= strchr(imageNameStart
, ' ');
310 if((imageNameEnd
== NULL
) || (imageNameEnd
> nextSpace
)) {
311 imageNameEnd
= nextSpace
;
314 if(imageNameEnd
== NULL
) {
315 printf("***Bad HTML - missing quote/bracket after image file name\n");
320 /* fill in a ThreadParams */
321 thisThread
= ¶ms
[numThreads
];
322 thisThread
->host
= host
;
323 thisThread
->isSsl
= isSsl
;
324 thisThread
->useCfNet
= useCfNet
;
325 thisThread
->singleThread
= singleThread
;
326 thisThread
->threadNum
= numThreads
;
327 thisThread
->quiet
= quiet
;
328 thisThread
->ortn
= -1;
330 /* path may be relative to basePath or a fully qualified URL */
331 imageNameLen
= imageNameEnd
- imageNameStart
;
332 if(imageNameStart
[0] == '/') {
333 /* absolute path, use as is */
334 memmove(thisThread
->path
, imageNameStart
, imageNameLen
);
335 thisThread
->path
[imageNameLen
] = '\0';
337 else if(strncmp(imageNameStart
, "http", 4) == 0) {
338 /* skip "http://" or "https://"; host name goes from after
339 * tha until next '/' */
340 const char *hostStart
= strstr(imageNameStart
, "//");
341 if((hostStart
== NULL
) || (hostStart
> (imageNameEnd
-2))) {
346 const char *hostEnd
= strchr(hostStart
, '/');
347 if(hostEnd
>= imageNameEnd
) {
351 /* we're gonna leak this host string for now */
352 unsigned hostLen
= hostEnd
- hostStart
;
353 char *hostStr
= (char *)malloc(hostLen
+ 1);
354 memmove(hostStr
, hostStart
, hostLen
);
355 hostStr
[hostLen
] = '\0';
356 thisThread
->host
= (const char *)hostStr
;
357 /* remainder is path */
358 /* FIXME - may have to deal with port number, currently in host string */
359 memmove(thisThread
->path
, hostEnd
, imageNameEnd
-hostEnd
);
360 thisThread
->path
[imageNameEnd
-hostEnd
] = '\0';
362 if(isSsl
&& isHostNonSsl(hostStr
)) {
363 /* some sites, e.g., cdu1.cduniverse.com, reference images
364 * which are NOT available via SSL */
365 thisThread
->isSsl
= 0;
369 /* path := basePath | relativePath */
370 unsigned basePathLen
= strlen(basePath
);
371 memmove(thisThread
->path
, basePath
, basePathLen
);
372 memmove(thisThread
->path
+ basePathLen
, imageNameStart
, imageNameLen
);
373 thisThread
->path
[basePathLen
+ imageNameLen
] = '\0';
376 #if SKIP_MULTI_QUERIES
377 if(strchr(thisThread
->path
, '|')) {
378 /* CFURLCreateWithBytes will choke, so will URLSimpleDownload */
383 /* fork off a thread to fetch it */
386 printUrl(thisThread
->host
, thisThread
->path
, thisThread
->isSsl
);
387 printf(": thread %u : forking imageThread\n",
388 thisThread
->threadNum
);
390 prtn
= pthread_create(&thisThread
->pthr
,
395 printf("***Error creating pthread (%d)\n", prtn
);
400 if(numThreads
== MAX_THREADS
) {
401 /* OK, that's enough */
407 /* wait for each thread to complete */
409 printf(" waiting for image threads to complete...\n");
411 for(threadDex
=0; threadDex
<numThreads
; threadDex
++) {
413 thisThread
= ¶ms
[threadDex
];
414 prtn
= pthread_join(thisThread
->pthr
, &status
);
416 printf("***pthread_join returned %d, aborting\n", prtn
);
420 if(!quiet
|| thisThread
->ortn
) {
422 printUrl(thisThread
->host
, thisThread
->path
, thisThread
->isSsl
);
423 printf(": thread %u : fetch result %d, read %d bytes\n",
424 thisThread
->threadNum
,
425 (int)thisThread
->ortn
, thisThread
->bytesRead
);
427 if(thisThread
->ortn
) {
435 int main(int argc
, char **argv
)
438 bool useCfNet
= true;
439 int singleThread
= 0;
453 for(arg
=3; arg
<argc
; arg
++) {
454 switch(argv
[arg
][0]) {
472 /* first get the main body of html text */
473 printf("...fetching page at ");
474 printUrl(host
, path
, isSsl
);
477 ortn
= fetchUrl(host
, path
, isSsl
, useCfNet
, singleThread
, &bytesRead
, &cfData
);
479 printf("***Error %d fetching from host %s path %s\n", (int)ortn
, host
, path
);
483 /* parse the HTML, forking off a thread for each IMG SRC found */
484 ourRtn
= fetchImages(cfData
, host
, path
, isSsl
, useCfNet
, singleThread
, quiet
);
487 printf("===%s exiting with %d %s for host %s\n", argv
[0], ourRtn
,
488 (ourRtn
> 1) ? "errors" : "error", host
);