/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                              \___|\___/|_| \_\_____|
 *
 * Copyright (C) 2018 - 2022 Jeroen Ooms <jeroenooms@gmail.com>
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */
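
/* Overall flow: the crawl starts at start_page, runs transfers through the
 * curl multi interface and, for every HTML page that comes back, queues up
 * to max_link_per_page of its links as new transfers, until the max_total
 * budget is used up or SIGINT is received. */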

/* Parameters */
int max_con = 200;             /* max simultaneous connections */
int max_total = 20000;         /* stop queueing links once this many transfers are done or in flight */
int max_requests = 500;        /* do not queue new links while this many transfers are pending */
int max_link_per_page = 5;     /* links queued per parsed HTML page */
int follow_relative_links = 0; /* resolve relative hrefs against the page URL */
char *start_page = "https://www.reuters.com";

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>

/* set from the signal handler so the main loop can exit cleanly on SIGINT */
volatile sig_atomic_t pending_interrupt = 0;
void sighandler(int dummy)
{
  pending_interrupt = 1;
}

/* resizable buffer */
typedef struct {
  char *buf;
  size_t size;
} memory;

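/* CURLOPT_WRITEFUNCTION callback: appends each received chunk to the
   per-transfer buffer. Returning fewer bytes than were passed in makes
   libcurl abort the transfer with CURLE_WRITE_ERROR, which is what the
   out-of-memory path below relies on. */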
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  memory *mem = (memory*) ctx;
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* out of memory */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}

CURL *make_handle(char *url)
{
  CURL *handle = curl_easy_init();

  /* Important: use HTTP2 over HTTPS */
  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
  curl_easy_setopt(handle, CURLOPT_URL, url);

  /* buffer body */
  memory *mem = malloc(sizeof(memory));
  mem->size = 0;
  mem->buf = malloc(1);
  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);

  /* For completeness */
  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
  curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
  return handle;
}
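
/* Note: CURLOPT_URL copies the string, so the URL passed in does not have to
   stay allocated after this call, and the response buffer attached with
   CURLOPT_PRIVATE is retrieved again in main() via CURLINFO_PRIVATE. */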

/* HREF finder implemented in libxml2 but could be any HTML parser */
size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  htmlDocPtr doc = htmlReadMemory(mem->buf, (int)mem->size, url, NULL, opts);
  if(!doc)
    return 0;
  xmlChar *xpath = (xmlChar*) "//a/@href";
  xmlXPathContextPtr context = xmlXPathNewContext(doc);
  xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
  xmlXPathFreeContext(context);
  if(!result) {
    xmlFreeDoc(doc);
    return 0;
  }
  xmlNodeSetPtr nodeset = result->nodesetval;
  if(xmlXPathNodeSetIsEmpty(nodeset)) {
    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);
    return 0;
  }
  size_t count = 0;
  int i;
  for(i = 0; i < nodeset->nodeNr; i++) {
    /* pick a random href; dividing by RAND_MAX + 1.0 keeps x below nodeNr */
    double r = rand();
    int x = (int)(r * nodeset->nodeNr / ((double)RAND_MAX + 1.0));
    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
    xmlChar *href = xmlNodeListGetString(doc, node, 1);
    if(follow_relative_links) {
      xmlChar *orig = href;
      href = xmlBuildURI(href, (xmlChar *) url);
      xmlFree(orig);
    }
    char *link = (char *) href;
    if(!link)
      continue;
    /* queue absolute http(s) links only; free the string in every case */
    if(strlen(link) >= 20 &&
       (!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8))) {
      curl_multi_add_handle(multi_handle, make_handle(link));
      count++;
    }
    xmlFree(link);
    if(count == (size_t) max_link_per_page)
      break;
  }
  xmlXPathFreeObject(result);
  xmlFreeDoc(doc);
  return count;
}
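
/* follow_links() returns the number of new transfers it queued; main() adds
   that to its running count of pending transfers. */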

int is_html(char *ctype)
{
  /* Content-Type is typically "text/html" or "text/html; charset=..." */
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}

int main(void)
{
  signal(SIGINT, sighandler);
  LIBXML_TEST_VERSION;
  curl_global_init(CURL_GLOBAL_DEFAULT);
  CURLM *multi_handle = curl_multi_init();
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS,
                    (long)max_con);
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

  /* enables http/2 if available */
#ifdef CURLPIPE_MULTIPLEX
  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif
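
  /* With CURLPIPE_MULTIPLEX set, transfers to the same host can be
     multiplexed over a single HTTP/2 connection when the server supports
     it, instead of each opening its own connection. */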

  /* sets html start page */
  curl_multi_add_handle(multi_handle, make_handle(start_page));

  int msgs_left;
  int pending = 0;
  int complete = 0;
  int still_running = 1;
  while(still_running && !pending_interrupt) {
    int numfds;
    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    curl_multi_perform(multi_handle, &still_running);

    /* See how the transfers went */
    CURLMsg *m = NULL;
    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
      if(m->msg == CURLMSG_DONE) {
        CURL *handle = m->easy_handle;
        char *url;
        memory *mem;
        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
        if(m->data.result == CURLE_OK) {
          long res_status;
          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
          if(res_status == 200) {
            char *ctype;
            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
            printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
            if(is_html(ctype) && mem->size > 100) {
              if(pending < max_requests && (complete + pending) < max_total) {
                pending += follow_links(multi_handle, mem, url);
                still_running = 1;
              }
            }
          }
          else {
            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
          }
        }
        else {
          printf("[%d] Connection failure: %s\n", complete, url);
        }
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        free(mem->buf);
        free(mem);
        complete++;
        pending--;
      }
    }
  }
  curl_multi_cleanup(multi_handle);
  curl_global_cleanup();
  return 0;
}