最近学深度学习,但是无奈很缺数据,所以就写了一个网页爬虫去爬取图像,(一个一个手动下载的话太烦了)
#define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS
#define _WINSOCK_DEPRECATED_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#include <winsock2.h>
#include <Windows.h>
#include <string>
#include <iostream>
#include <vector>
#include <process.h>
#include <WinInet.h>
#include <assert.h>
#pragma comment(lib, "Wininet.lib")
#pragma comment(lib, "ws2_32.lib")
using namespace std;
//获取网站的源码
void GetWebCode(const char* szWeb,char* szCode,int nSize,int& nLen)
{
HINTERNET hOpen = NULL, hOpenUrl = NULL;
nLen = 0;
hOpen = InternetOpen("Testing", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (hOpen)
{
hOpenUrl = InternetOpenUrl(hOpen, szWeb, NULL, 0, INTERNET_FLAG_RELOAD, 0);
if (hOpenUrl)
{
Sleep(40);
DWORD dwByteRead = 0;
if (InternetReadFile(hOpenUrl, szCode, nSize, &dwByteRead))
{
assert(dwByteRead < nSize);
nLen = dwByteRead;
}
}
}
if(hOpen)
InternetCloseHandle(hOpen);
if(hOpenUrl)
InternetCloseHandle(hOpen);
}
// Naive substring search over a raw (possibly non-NUL-terminated) buffer.
// szMain/nMainSize : haystack and its length in bytes.
// szSub/nSubSize   : needle and its length in bytes.
// nBegin           : index to start searching from.
// Returns the index of the first occurrence at or after nBegin, or -1
// when not found or on invalid arguments.
int FindSubstr(const char* szMain, int nMainSize, const char* szSub, int nSubSize, int nBegin = 0)
{
	if (!szMain || !szSub)
		return -1;
	if (nMainSize <= 0 || nSubSize <= 0 || nBegin < 0)
		return -1;
	// Stop early enough that szMain[i + nSubSize - 1] stays in bounds;
	// the original scanned up to nMainSize and read past the buffer end.
	for (int i = nBegin; i + nSubSize <= nMainSize; i++)
	{
		bool bMatch = true;
		for (int j = 0; j < nSubSize; j++)
		{
			if (szMain[i + j] != szSub[j])
			{
				bMatch = false;
				break;
			}
		}
		if (bMatch)
			return i;
	}
	return -1;
}
// Extract every link of the form href="...html" from szBuffer[0, nLen)
// and append it to cHrefList, echoing each one to stdout.
void GetWebHref(vector<string>& cHrefList, const char* szBuffer, int nLen)
{
	const char* szSub = "href=";
	const char* szSubEnd = "html";
	int nSubLen = (int)strlen(szSub), nSubEndlen = (int)strlen(szSubEnd);
	int nRet = 0, nEnd = 0;
	int nIndex = 0;
	do
	{
		nRet = FindSubstr(szBuffer, nLen, szSub, nSubLen, nEnd);
		if (nRet != -1)
		{
			nEnd = FindSubstr(szBuffer, nLen, szSubEnd, nSubEndlen, nRet + nSubLen);
			if (nEnd != -1)
			{
				// Copy from just after the opening quote (nRet + nSubLen + 1)
				// through the end of "html"; nEnd - nRet - 2 spans exactly that.
				int nCopy = nEnd - nRet - 2;
				char szTemp[1024] = { 0 };
				// Clamp so an oversized or malformed href cannot overflow
				// the stack buffer (the original copied unchecked).
				if (nCopy > 0 && nCopy < (int)sizeof(szTemp))
				{
					strncpy(szTemp, szBuffer + nRet + nSubLen + 1, nCopy);
					cout << "[" << nIndex++ << "] -> " << szTemp << endl;
					cHrefList.emplace_back(szTemp);
				}
			}
		}
	} while (nRet != -1 && nEnd != -1);
}
// For every album page URL in cUrl, walk its numbered sub-pages
// (xxx_1.html, xxx_2.html, ...) and collect each full-size jpg address
// found after the "paper-down" marker into cJpg.
void GetWebJpg(vector<string>& cUrl, vector<string>& cJpg)
{
	const int Size = 1024 * 40;
	char szCode[Size] = { 0 };
	const char* szPos = "paper-down";
	const char* szBegin = "href=";
	const char* szEnd = "jpg";
	int nIndex = 0;
	for (auto& it : cUrl)
	{
		// Need at least ".html" to strip, and room to re-append "_N.html".
		if (it.size() < 5 || it.size() >= 1000)
			continue;
		char szBase[1024] = { 0 };
		strncpy(szBase, it.c_str(), it.size());
		szBase[it.size() - 5] = '\0'; // drop the ".html" suffix
		// No album has more than 20 images (original assumption, kept).
		for (int i = 1; i < 20; i++)
		{
			// Build the page URL in a separate buffer: the original
			// sprintf'ed szWeb into itself, which is undefined behavior.
			char szWeb[1024] = { 0 };
			sprintf(szWeb, "%s_%d.html", szBase, i);
			// Fetch the page source.
			int nLen = 0;
			GetWebCode(szWeb, szCode, Size, nLen);
			if (!nLen)
				continue;
			// Past the last image the site serves a "SORRY" page. Search
			// only the bytes actually read — the original scanned the whole
			// 40 KB buffer, including stale data from previous iterations.
			const char* szNullPage = "SORRY";
			if (FindSubstr(szCode, nLen, szNullPage, (int)strlen(szNullPage)) != -1)
				break;
			cout << "\t" << szWeb << endl;
			int nPos = FindSubstr(szCode, nLen, szPos, (int)strlen(szPos));
			if (nPos == -1)
				continue;
			// Guard each step: the original passed -1 as the start index
			// into the next search when a marker was missing.
			int nBegin = FindSubstr(szCode, nLen, szBegin, (int)strlen(szBegin), nPos);
			if (nBegin == -1)
				continue;
			int nEnd = FindSubstr(szCode, nLen, szEnd, (int)strlen(szEnd), nBegin);
			if (nEnd == -1)
				continue;
			// Copy from just after the opening quote through the "jpg"
			// suffix; clamp so a malformed page cannot overflow the buffer.
			int nCopy = nEnd - nBegin - (int)strlen(szEnd);
			char szTemp[1024] = { 0 };
			if (nCopy > 0 && nCopy < (int)sizeof(szTemp))
			{
				strncpy(szTemp, szCode + nBegin + (int)strlen(szBegin) + 1, nCopy);
				cout << "\t [" << nIndex++ << "] -> " << szTemp << endl;
				cJpg.emplace_back(szTemp);
			}
		}
	}
}
long g_nJpg = 0;//记录图像数量
// Download every jpg URL in cJpg into the Images\ directory, naming each
// file from the interlocked global counter g_nJpg so names never collide.
void DownLoadJpg(vector<string>& cJpg)
{
	const int Size = 1024 * 40;
	char szDownLoad[Size] = { 0 };
	char szPath[1024] = { 0 };
	HINTERNET hOpen = InternetOpen("Testing", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
	if (!hOpen)
		return;
	for (auto& it : cJpg)
	{
		HINTERNET hOpenUrl = InternetOpenUrl(hOpen, it.c_str(), NULL, 0, INTERNET_FLAG_RELOAD, 0);
		if (!hOpenUrl)
			continue;
		long index = InterlockedIncrement(&g_nJpg);
		sprintf(szPath, "Images\\%d.jpg", index);
		HANDLE hFile = CreateFile(szPath, GENERIC_WRITE, 0, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0);
		if (hFile != INVALID_HANDLE_VALUE)
		{
			cout << "download [" << index << "] -> " << it.c_str() << endl;
			DWORD dwRecv = 0, dwSend = 0;
			// Stream the body to disk chunk by chunk.
			while (true)
			{
				Sleep(10);
				// Break on read failure too — the original ignored the
				// return value, looping on a stale dwRecv.
				if (!InternetReadFile(hOpenUrl, szDownLoad, Size, &dwRecv) || !dwRecv)
					break;
				if (!WriteFile(hFile, szDownLoad, dwRecv, &dwSend, NULL) || !dwSend)
					break;
			}
			CloseHandle(hFile);
		}
		// Close per URL — the original leaked every handle but the last.
		InternetCloseHandle(hOpenUrl);
	}
	InternetCloseHandle(hOpen);
}
// Download all images reachable from one list page.
// Returns 1 on success, 0 when the page is the "SORRY" (no-more-pages)
// page, -1 on any failure.
int JpgDownLoader(const char* szWeb)
{
	// Fetch the page source first.
	const int Size = 1024 * 40;
	int nLen = 0;
	char szCode[Size] = { 0 };
	GetWebCode(szWeb, szCode, Size, nLen);
	if (!nLen)
	{
		cout << "获取网站源码失败" << endl;
		return -1;
	}
	// Ensure NUL-termination before strstr: GetWebCode does not guarantee
	// a terminator when the buffer fills completely.
	if (nLen >= Size)
		nLen = Size - 1;
	szCode[nLen] = '\0';
	// Search only the bytes actually read — the original scanned the whole
	// 40 KB buffer, including stale bytes.
	const char* szNullPage = "SORRY";
	if (FindSubstr(szCode, nLen, szNullPage, (int)strlen(szNullPage)) != -1)
	{
		cout << "空网页" << endl;
		return 0;
	}
	// Narrow down to the region between "Left_bar" and "pages" for parsing.
	const char* szBegin = strstr(szCode, "Left_bar");
	const char* szEnd = strstr(szCode, "pages");
	if (!szBegin || !szEnd || szEnd <= szBegin)
	{
		cout << "无法解析出目标区域" << endl;
		return -1;
	}
	// Pull every href out of the <ul> block.
	vector<string> cDownloadWeb;
	GetWebHref(cDownloadWeb, szBegin, (int)(szEnd - szBegin));
	if (cDownloadWeb.empty())
	{
		cout << "获取图像目标失败" << endl;
		return -1;
	}
	// Drop the first parsed link (per the original logic; presumably it is
	// not an album link — TODO confirm against the site's markup).
	cDownloadWeb.erase(cDownloadWeb.begin());
	// Resolve each album page into direct jpg addresses.
	vector<string> cJpgList;
	GetWebJpg(cDownloadWeb, cJpgList);
	if (cJpgList.empty())
	{
		cout << "获取图像地址失败" << endl;
		return -1;
	}
	// Download the images.
	DownLoadJpg(cJpgList);
	return 1;
}
// Interactive driver: read the root list-page URL from stdin, then walk
// its numbered sibling pages (xxx_1.html .. xxx_9.html) and download each.
void Downloading()
{
	// Directory that receives the downloaded images (relative to the CWD).
	CreateDirectory("Images", NULL);
	// Read the root URL; it must end in ".html" so the suffix can be stripped.
	char szBuffer[1024] = { 0 };
	cin.getline(szBuffer, 1024);
	int nLen = (int)strlen(szBuffer);
	// Anything shorter than "x.html" would index before the buffer start
	// (the original wrote to szBuffer[nLen - 5] unconditionally).
	if (nLen >= 6)
	{
		szBuffer[nLen - 5] = '\0'; // strip ".html"
		int nCount = 10; // retry budget shared across all pages
		for (int i = 1; i < 10; i++)
		{
			char szTemp[1024];
			sprintf(szTemp, "%s_%d.html", szBuffer, i);
			cout << "[" << i << "] Begin Download -> " << szTemp << endl;
			// Start the download of this page.
			int nRet = JpgDownLoader(szTemp);
			if (nRet == 0)
				break; // empty page: we are past the last one
			else if (nRet == -1 && --nCount >= 0)
				i--; // transient failure: retry the same page while budget remains
		}
	}
	cout << "完成下载任务,下载图像数量为" << g_nJpg << "...." << endl;
}
// Entry point of the first crawler: print the banner, run the interactive
// downloader, then wait for a key press so the console stays open.
// NOTE(review): argv[0] is the full executable path including the file
// name, so the printed save location is only approximate; the actual
// "Images" directory is created relative to the working directory.
int main(int argc,char* argv[])
{
cout << "图片下载器" << endl;
cout << "图像将保存到" << argv[0] << "\\Images" << endl;
cout << "请输入根网页路径:" << endl;
Downloading();
// Block until a key press so the summary stays visible.
getchar();
return 0;
}
/************************************************************************
美桌网(http://www.win4000.com/)图片下载程序
Time:2019-11-30
By fyh
************************************************************************/
#include <Windows.h>
#include <WinInet.h>
#include <time.h>
#pragma comment(lib, "Wininet.lib")
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <future>
using namespace std;
char g_dir[1024];
long g_jpg_index = 0;
// Fetch the full source of a web page.
// web : URL to fetch.
// Returns the page contents, or an empty string on any failure.
string get_web_code(const char* web)
{
	// Use the current timestamp as the user-agent string (original scheme).
	char agent_name[1024]{ 0 };
	_itoa_s(static_cast<int>(time(NULL)), agent_name, 10);
	string web_code;
	HINTERNET internet_open = InternetOpenA(agent_name, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
	if (internet_open)
	{
		HINTERNET internet_url = InternetOpenUrlA(internet_open, web, NULL, 0, INTERNET_FLAG_RELOAD, 0);
		if (internet_url)
		{
			const int max_size = 1024 * 10;
			char code_buffer[max_size];
			DWORD read_byte = 0;
			// Keep reading until the whole page has been received.
			while (true)
			{
				if (InternetReadFile(internet_url, code_buffer, max_size, &read_byte))
				{
					if (read_byte == 0)
						break;
					// Append exactly read_byte bytes. The original appended
					// the buffer as a C string: with a completely full buffer
					// there is no terminator (out-of-bounds read), and any
					// embedded NUL silently truncated the page.
					web_code.append(code_buffer, read_byte);
				}
				else
				{
					web_code.clear();
					break;
				}
				Sleep(10);
			}
			InternetCloseHandle(internet_url);
		}
		InternetCloseHandle(internet_open);
	}
	return web_code;
}
// Report whether the fetched page is the site's "SORRY 404" error page.
bool is_error_404_page(std::string& web_code)
{
	return web_code.find("SORRY 404") != std::string::npos;
}
// Build the list of existing list pages web_1.html .. web_num.html,
// stopping at the first page the site reports as 404.
// web : a URL ending in ".html"; num : maximum number of pages to probe.
// Returns the URLs of the pages that exist (possibly empty).
vector<string> get_web_list(const char* web, int num = 10)
{
	vector<string> web_list;
	int web_len = (int)strlen(web);
	// Need at least ".html" to strip; the original wrote to
	// web_temp[web_len - 5] unconditionally (underflow on short input).
	if (web_len < 5 || web_len >= 1024)
		return web_list;
	char web_temp[1024];
	strncpy_s(web_temp, web, web_len);
	web_temp[web_len - 5] = '\0'; // drop the ".html" suffix
	for (int i = 1; i <= num; i++)
	{
		char temp[1024];
		sprintf_s(temp, "%s_%d.html", web_temp, i);
		string web_code = get_web_code(temp);
		if (is_error_404_page(web_code))
			break;
		web_list.push_back(temp);
	}
	return web_list;
}
// Locate the parsing region inside web_code.
// beginpos/endpos override the default "Left_bar"/"pages" markers when
// non-null. begin/end receive the marker positions, or -1 (string::npos
// narrowed to int) when a marker is missing.
// Takes the page by const reference — the original copied the whole page
// string on every call.
void get_block_pos(const std::string& web_code, const char* beginpos, const char* endpos, int& begin, int& end)
{
	const std::string begin_pos = beginpos ? beginpos : "Left_bar";
	const std::string end_pos = endpos ? endpos : "pages";
	begin = (int)web_code.find(begin_pos);
	// Only search for the end marker after a found begin marker.
	end = (begin != -1) ? (int)web_code.find(end_pos, begin) : -1;
}
// Collect every link of the form href="...html" lying inside
// web_code[begin_index, end_index) and return them in document order.
std::vector<std::string> get_target_list(std::string& web_code, int begin_index, int end_index)
{
	const char* anchor = "href=";
	const char* suffix = "html";
	const int anchor_len = 5; // strlen("href=")
	const int suffix_len = 4; // strlen("html")
	std::vector<std::string> target_list;
	int cursor = begin_index;
	for (;;)
	{
		const int hit = (int)web_code.find(anchor, cursor);
		if (hit == -1)
			break;
		const int tail = (int)web_code.find(suffix, hit);
		if (tail == -1)
			break;
		// Stop once either marker leaves the allowed region.
		if (hit >= end_index || tail >= end_index)
			break;
		// Skip the opening quote after "href=" and include the "html" suffix.
		const int from = hit + anchor_len + 1;
		const int to = tail + suffix_len;
		target_list.push_back(web_code.substr(from, to - from));
		cursor = tail;
	}
	return target_list;
}
// Download a single jpg to <g_dir>/<index>.jpg, where index comes from
// the interlocked global counter g_jpg_index so concurrent downloads get
// unique file names.
void download_jpg(string& web_jpg)
{
	// Timestamp-based user-agent (original scheme).
	char agent_name[1024]{ 0 };
	_itoa_s(static_cast<int>(time(NULL)), agent_name, 10);
	HINTERNET internet_open = NULL;
	HINTERNET internet_url = NULL;
	HANDLE jpg_file = INVALID_HANDLE_VALUE;
	internet_open = InternetOpenA(agent_name, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
	if (internet_open)
	{
		internet_url = InternetOpenUrlA(internet_open, web_jpg.c_str(), NULL, 0, INTERNET_FLAG_RELOAD, 0);
		if (internet_url)
		{
			char jpg_temp[1024];
			sprintf_s(jpg_temp, "%s//%d.jpg", g_dir, InterlockedIncrement(&g_jpg_index));
			jpg_file = CreateFile(jpg_temp, GENERIC_WRITE, 0, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0);
			if (jpg_file != INVALID_HANDLE_VALUE)
			{
				DWORD read_byte = 0;
				DWORD write_byte = 0;
				const int max_size = 1024 * 10;
				char code_buffer[max_size];
				// Stream the body to disk chunk by chunk.
				while (true)
				{
					if (!InternetReadFile(internet_url, code_buffer, max_size, &read_byte))
						break; // read error — the original spun forever here
					if (read_byte == 0)
						break; // end of stream
					WriteFile(jpg_file, code_buffer, read_byte, &write_byte, NULL);
					Sleep(20);
				}
			}
		}
	}
	if (internet_open)
		InternetCloseHandle(internet_open);
	if (internet_url)
		InternetCloseHandle(internet_url);
	if (jpg_file != INVALID_HANDLE_VALUE)
		CloseHandle(jpg_file);
}
// Walk one album's numbered pages (xxx_1.html, xxx_2.html, ...) and
// download the full-size jpg referenced by each page's
// "pic-large" / "data-original" tag.
void target_download(string& web)
{
	const int num = 20; // page probe limit per album (original assumption)
	// Need at least ".html" to strip; the original wrote to
	// web_temp[web.size() - 5] unconditionally (underflow on short input).
	if (web.size() < 5 || web.size() >= 1024)
		return;
	char web_temp[1024];
	strncpy_s(web_temp, web.c_str(), web.size());
	web_temp[web.size() - 5] = '\0'; // drop ".html"
	for (int i = 1; i < num; i++)
	{
		char temp[1024];
		sprintf_s(temp, "%s_%d.html", web_temp, i);
		string web_code = get_web_code(temp);
		// Past the last image the site serves its 404 page.
		if (is_error_404_page(web_code))
			break;
		const char* begin = "pic-large";
		const char* target = "data-original=";
		const char* target_end = "jpg";
		int pos = (int)web_code.find(begin);
		if (pos == -1)
			continue;
		int begin_index = (int)web_code.find(target, pos);
		if (begin_index == -1)
			continue;
		int end_index = (int)web_code.find(target_end, begin_index);
		if (end_index == -1)
			continue;
		// Extract from just after the quote following "data-original="
		// through the end of the "jpg" extension.
		string target_web = web_code.substr(begin_index + strlen(target) + 1,
			(end_index + strlen(target_end)) - (begin_index + strlen(target) + 1));
		cout << "\t开始下载图片 -> " << target_web << endl;
		download_jpg(target_web);
	}
}
// Analyse one list page: fetch it, locate the target region, extract the
// album links, and download every album.
// Returns true when the page was handled (including the benign 404 case),
// false on a hard failure.
bool analise_web(string web, const char* beginpos, const char* endpos)
{
	// Fetch the page source.
	string web_code = get_web_code(web.c_str());
	if (web_code.empty())
	{
		cout << "读取网页源代码失败 -> " << web << endl;
		return false;
	}
	// A 404 page just means we ran past the last list page.
	if (is_error_404_page(web_code))
	{
		cout << "当前页面为错误页面 -> " << web << endl;
		return true;
	}
	// Locate the region bounded by the begin/end markers.
	int begin_index = 0;
	int end_index = 0;
	get_block_pos(web_code, beginpos, endpos, begin_index, end_index);
	if (begin_index == -1 || end_index == -1)
	{
		cout << "解析目标区域失败 -> " << web << endl;
		return false;
	}
	// Extract the album links inside the region.
	vector<string> targer_list = get_target_list(web_code, begin_index, end_index);
	if (targer_list.empty())
	{
		cout << "获取目标连接失败 -> " << web << endl;
		return false;
	}
	// Drop the first parsed link (per the original logic; presumably not an
	// album link — TODO confirm against the site's markup).
	targer_list.erase(targer_list.begin());
	// Download every album.
	for (auto& it : targer_list)
	{
		target_download(it);
	}
	// The original returned false here even on success, making success and
	// failure indistinguishable to a caller; report success truthfully.
	return true;
}
//开始工作
bool begin_to_work(const char* foler, const char* web, const char* beginpos = nullptr, const char* endpos = nullptr)
{
cout << "美桌网(http://www.win4000.com/)图片下载程序" << endl;
//创建文件夹保存下载的图片
strncpy_s(g_dir, foler, strlen(foler));
CreateDirectoryA(foler, NULL);
//获取网址列表
vector<string> web_list = get_web_list(web);
//analise_web(web_list[0], beginpos, endpos);
vector<shared_future<bool>> statu;
for (int i = 0; i < web_list.size(); i++)
{
shared_future<bool> ret = std::async(launch::async, analise_web, move(web_list[i]), beginpos, endpos);
statu.push_back(ret);
}
for (auto& it : statu)
it.get();
cout << "完成图片下载,成功下载图片数量为:" << g_jpg_index << endl;
return true;
}
// Print command-line usage, one line per argument.
void showhelp()
{
	const char* usage_lines[] = {
		"图片下载程序用法简介:",
		"参数1[必须]\t保存图片的文件夹名",
		"参数2[必须]\t目标网址",
		"参数3[非必须]\t目标区域开始标识",
		"参数4[非必须]\t目标区域结束标识",
	};
	for (const char* line : usage_lines)
		std::cout << line << std::endl;
}
// Entry point of the second crawler: argv[1] = output folder,
// argv[2] = target URL. With fewer than two arguments, print usage.
int main(int argc, char* argv[], char* env[])
{
	if (argc < 3)
	{
		showhelp();
		return 0;
	}
	begin_to_work(argv[1], argv[2]);
	return 0;
}
![](https://img-blog.csdnimg.cn/20191124151953868.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3p6eTE0NDgzMzE1ODA=,size_16,color_FFFFFF,t_70)
代码有点low,但是能爬取图像就行。代码对网页有针对性,所以要爬取不同网页的话,需要自己根据指定网页的结构修改代码。