PDA

View Full Version : loads of http at once



Yakman
11-07-2008, 07:56 PM
was messing around with nonblocking sockets and select()
made this,

Downloads lots of pages at once over HTTP and saves each one to its own file.
It doesn't parse the response or anything, so each saved file still starts with the HTTP headers.


trial run


| connecting to www.utas.edu.au...
| connecting to www.moparisthebest.com...
| connecting to rafb.net...
| connecting to www.google.co.uk...
| connecting to en.wikipedia.org...
| connecting to www.imageshack.us...
| connecting to www.richarddawkins.net...
| connecting to www.hevanet.com...
| connecting to beej.us...
| connecting to graphics.stanford.edu...
|| connected to www.moparisthebest.com in 1 secs, 4294282325 usecs
|| connected to rafb.net in 1 secs, 4294262155 usecs
|| connected to www.google.co.uk in 1 secs, 4294242404 usecs
|| connected to en.wikipedia.org in 1 secs, 4294219091 usecs
||| sent all data to www.moparisthebest.com in 0 secs, 37 usecs
||| sent all data to rafb.net in 0 secs, 151 usecs
||| sent all data to www.google.co.uk in 0 secs, 182 usecs
||| sent all data to en.wikipedia.org in 0 secs, 214 usecs
|| connected to www.utas.edu.au in 1 secs, 4294327242 usecs
||| sent all data to www.utas.edu.au in 0 secs, 24 usecs
|| connected to www.richarddawkins.net in 0 secs, 114947 usecs
||| sent all data to www.richarddawkins.net in 0 secs, 19 usecs
|||| recv'd all data from www.google.co.uk in 0 secs, 84416 usecs
|| connected to www.imageshack.us in 0 secs, 189829 usecs
||| sent all data to www.imageshack.us in 0 secs, 20 usecs
|| connected to beej.us in 0 secs, 177506 usecs
||| sent all data to beej.us in 0 secs, 21 usecs
|| connected to www.hevanet.com in 0 secs, 198819 usecs
||| sent all data to www.hevanet.com in 0 secs, 30 usecs
|| connected to graphics.stanford.edu in 0 secs, 181828 usecs
|||| recv'd all data from en.wikipedia.org in 0 secs, 181548 usecs
||| sent all data to graphics.stanford.edu in 0 secs, 141 usecs
|||| recv'd all data from rafb.net in 0 secs, 212302 usecs
|||| recv'd all data from beej.us in 0 secs, 374370 usecs
|||| recv'd all data from www.hevanet.com in 0 secs, 400847 usecs
|||| recv'd all data from www.richarddawkins.net in 0 secs, 620682 usecs
|||| recv'd all data from www.moparisthebest.com in 0 secs, 780372 usecs
|||| recv'd all data from www.imageshack.us in 0 secs, 720049 usecs
|||| recv'd all data from graphics.stanford.edu in 1 secs, 4294934726 usecs
|||| recv'd all data from www.utas.edu.au in 2 secs, 4294824835 usecs
** finished all jobs



the code




/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/*
downloads loads of things at once with http
yakman
*/

#include <sys/time.h>
#include <sys/socket.h>
#include <unistd.h>
#include <fcntl.h>
#include <netdb.h>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>


//stuff below parses a url
#define MAX_URL_LENGTH 256

/*
A parsed "http://" URL.
host: NUL-terminated host name (no scheme, no port)
file: NUL-terminated request path, always starting with '/'
port: TCP port, 80 when the URL does not name one
*/
typedef struct URL {
    char host[128];
    char file[128];
    int port;
} URL;

/*
Splits an "http://host[:port][/path]" string into its parts.

url        receives host, file and port on success
string     the URL text to parse
error_msg  receives a human-readable message on failure (may be NULL)
error_size capacity of error_msg in bytes

Returns 0 on success and 1 on failure. Every component is checked against
the size of the field it is stored in, so an over-long host, path or port
is reported as an error instead of overrunning the URL struct.
*/
int
parseURL(URL* url, const char* string, char* error_msg, int error_size) {
    /* snprintf() takes a size_t: never hand it a negative size or a NULL buffer with room */
    size_t err_cap = (error_msg != NULL && error_size > 0) ? (size_t)error_size : 0;

    if (strlen(string) > MAX_URL_LENGTH) {
        snprintf(error_msg, err_cap, "string too long, max length is %d", MAX_URL_LENGTH);
        return 1;
    }

    if (strncmp(string, "http://", 7) != 0) {
        snprintf(error_msg, err_cap, "wrong protocol, has to be \"http\"");
        return 1;
    }
    string += 7;

    /* the host runs up to the first ':' (port follows) or '/' (path follows) */
    size_t host_len = strcspn(string, "/:");
    if (host_len == 0) {
        snprintf(error_msg, err_cap, "missing host name");
        return 1;
    }
    if (host_len >= sizeof url->host) {
        snprintf(error_msg, err_cap, "host too long, max length is %d",
                 (int)(sizeof url->host - 1));
        return 1;
    }
    memcpy(url->host, string, host_len);
    url->host[host_len] = '\0';

    const char* rest = string + host_len;
    long port = 80;
    if (*rest == ':') {
        rest++;
        char* port_end = NULL;
        /* require a leading digit so strtol() cannot accept whitespace or a sign */
        if (*rest >= '0' && *rest <= '9')
            port = strtol(rest, &port_end, 10);
        if (port_end == NULL
            || (*port_end != '/' && *port_end != '\0')
            || port < 1 || port > 65535) {
            snprintf(error_msg, err_cap, "invalid port, has to be a number from 1 to 65535");
            return 1;
        }
        rest = port_end;
    }
    url->port = (int)port;

    /* no path given: request the root document */
    if (*rest == '\0') {
        url->file[0] = '/';
        url->file[1] = '\0';
        return 0;
    }

    size_t file_len = strlen(rest);
    if (file_len >= sizeof url->file) {
        snprintf(error_msg, err_cap, "path too long, max length is %d",
                 (int)(sizeof url->file - 1));
        return 1;
    }
    memcpy(url->file, rest, file_len + 1); /* +1 copies the terminator */

    return 0;
}

/*
Builds the local file name for a URL: the host immediately followed by the
path, with every '/' turned into '.'.

The result lives in a static buffer, so it is overwritten by the next call
and must be used (or copied) before calling again.
*/
char*
make_file_name(URL* url) {
    static char buf[MAX_URL_LENGTH];

    size_t host_len = strlen(url->host);
    size_t path_len = strlen(url->file);

    memcpy(buf, url->host, host_len);
    memcpy(buf + host_len, url->file, path_len + 1); /* +1 brings the terminator along */

    /* one pass over the name, flattening directory separators */
    for (char* p = buf; *p != '\0'; p++) {
        if (*p == '/')
            *p = '.';
    }
    return buf;
}

/*
One download job: a socket, the HTTP request being sent on it, and the file
the response is written to. Jobs are chained into a singly linked list that
main() walks on every pass of its select() loop.
*/
typedef struct client {
int status; //0= need to connect() 1= need to send data 2= need to read data
int fd; //socket fd

int pos; //buffer position: number of bytes of buf already sent
int len; //buffer len: total number of request bytes stored in buf
char buf[512]; //send buffer, holds the formatted HTTP request

FILE* output; //output file, opened once the whole request has been sent
struct timeval tv; //time the current phase started, used for the timing printouts
URL url; //url

struct client* next; //linked list
} client;

/*
Builds the job list from the command line: one zero-initialised client per
URL in argv[1..argc-1], linked in command-line order.

Returns the head of the list, or NULL if there are no URLs, if memory runs
out, or if any URL fails to parse (a message is printed to stderr and every
node allocated so far is freed). On success the caller owns the list and
frees each node.
*/
client*
parse_command_line(int argc, char** argv) {
    client* head = NULL;
    client** tail = &head; /* where the next node gets linked in */

    for (int f = 1; f < argc; f++) {
        /* calloc zeroes the node: status 0, pos 0, next NULL */
        client* node = calloc(1, sizeof *node);
        if (node == NULL) {
            fprintf(stderr, "calloc(): %s\n", strerror(errno));
            goto fail;
        }

        char err[512] = "";
        if (0 != parseURL(&node->url, argv[f], err, sizeof(err))) {
            fprintf(stderr, "parseURL(): <%s> %s\n", argv[f], err);
            free(node);
            goto fail;
        }

        *tail = node;
        tail = &node->next;
    }
    return head;

fail:
    /* do not leak the jobs that were already built */
    while (head != NULL) {
        client* next = head->next;
        free(head);
        head = next;
    }
    return NULL;
}

/*
HTTP request template used for every job. main() fills it in with the
request path (first %s) and the host name (second %s). The receive loop in
main() treats end-of-stream as the end of the response.
*/
char format[] = "GET %s HTTP/1.1\r\n"
"Accept: */*\r\n"
"User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506; InfoPath.2)\r\n"
"Host: %s\r\n"
"Connection: Close\r\n\r\n";

int
main(int argc, char** argv) {
if(argc == 1)
return fprintf(stderr, "%s [url..]\n", argv[0]);
client* head = parse_command_line(argc, argv);
if(!head)
return EXIT_FAILURE;

client *c = head;
do { //loop through clients opening files and sending syns
c->fd = socket(AF_INET, SOCK_STREAM, 0);
if(c->fd < 0)
return fprintf(stderr, "socket(): %s\n", strerror(errno));

c->len = sprintf(c->buf, format, c->url.file, c->url.host);

//lookup host
struct hostent* he = gethostbyname(c->url.host);
if(he == NULL)
return fprintf(stderr, "gethostbyname(): <%s> %s\n", c->url.host, strerror(errno));

//set nonblocking
int arg = fcntl(c->fd, F_GETFL, NULL);
arg |= O_NONBLOCK;
fcntl(c->fd, F_SETFL, arg);

struct sockaddr_in address;
address.sin_family = AF_INET;
address.sin_port = htons(c->url.port);
address.sin_addr.s_addr = ((struct in_addr*)he->h_addr)->s_addr;

//start the connect sequence
errno = 0;
if(connect(c->fd, (struct sockaddr*)&address, sizeof(struct sockaddr)) > 0)
return fprintf(stderr, "connect returned unexpected\n");
gettimeofday(&c->tv, NULL);
printf(" | connecting to %s...\n", c->url.host);

if(errno != EINPROGRESS)
return fprintf(stderr, "unexpected: %s\n", strerror(errno));

c = c->next;
} while(c);


while(head) {

int maxfd = 0;
#define update_fd(x) maxfd = (maxfd > x ? maxfd : x)

fd_set rset, wset;
FD_ZERO(&rset);
FD_ZERO(&wset);

//fill up fd sets
client* s = head;
do {

switch(s->status) {
case 0: { //need to connect
FD_SET(s->fd, &wset);
update_fd(s->fd);
} break;
case 1: { //need to send data
FD_SET(s->fd, &wset);
update_fd(s->fd);
} break;
case 2: { //need to recv data
FD_SET(s->fd, &rset);
update_fd(s->fd);
} break;
default:
return fprintf(stderr, "unexpected status l:%d <%d>\n", __LINE__, s->status);
}

s = s->next;
} while(s);


if(select(maxfd + 1, &rset, &wset, NULL, NULL) < 0)
return fprintf(stderr, "select(): %s\n", strerror(errno));


//process fds
client* h = head;
client* t = NULL;
client* prev = NULL;
do {

switch(h->status) {
case 0: { //need to connect
if(!FD_ISSET(h->fd, &wset)) //connected
break;
struct timeval tv;
gettimeofday(&tv, NULL);
printf(" || connected to %s in %d secs, %u usecs\n",
h->url.host, tv.tv_sec - h->tv.tv_sec, tv.tv_usec - h->tv.tv_usec);
gettimeofday(&h->tv, NULL);
h->status++;

} break;
case 1: { //need to send

if(!FD_ISSET(h->fd, &wset))
break;
int r = send(h->fd, h->buf + h->pos, h->len - h->pos, 0);
if(r < 1)
return fprintf(stderr, "send(): %s\n", strerror(errno));
h->pos += r;

if(h->pos == h->len) {//sent all the data
struct timeval tv;
gettimeofday(&tv, NULL);
printf(" ||| sent all data to %s in %d secs, %u usecs\n",
h->url.host, tv.tv_sec - h->tv.tv_sec, tv.tv_usec - h->tv.tv_usec);
gettimeofday(&h->tv, NULL);

h->status++;
h->output = fopen(make_file_name(&h->url), "wb");
if(!h->output)
return fprintf(stderr, "fopen(): %s\n", strerror(errno));
}
} break;
case 2: { //need to recv
if(!FD_ISSET(h->fd, &rset))
break;
char buffer[1024];
errno = 0;
int r = recv(h->fd, buffer, sizeof(buffer), 0);
if(r < 1) { //eof or error
if(errno)
return fprintf(stderr, "recv(): %s\n", strerror(errno));

struct timeval tv;
gettimeofday(&tv, NULL);
printf(" |||| recv'd all data from %s in %d secs, %u usecs\n",
h->url.host, tv.tv_sec - h->tv.tv_sec, tv.tv_usec - h->tv.tv_usec);
fclose(h->output);
close(h->fd);

if(prev)
prev->next = h->next; //not the head
else
if(h->next)
head = h->next; //the head, but not the last element
else
head = NULL;//the head, also the last element
t = h->next;
free(h);
h = NULL;
break;
}

fwrite(buffer, 1, r, h->output);
} break;
default:
return fprintf(stderr, "unexpected status l:%d <%d>\n", __LINE__, s->status);
}

if(h) {
prev = h; //if this element was freed
h = h->next;
} else {
h = t;
}
} while(h);
}

printf("** finished all jobs\n");
return EXIT_SUCCESS;
}



post criticism

TViYH
11-07-2008, 09:22 PM
Jesus! I really need to learn C.. Looks very complicated to learn.

R0b0t1
11-07-2008, 10:00 PM
<insert witty criticism no one expected>

Wizzup?
11-11-2008, 08:36 PM
Cool.
Did you find a real use for this yet? :p (Next to experience)