|
楼主 |
发表于 2010-12-8 16:35:00
|
显示全部楼层
也许我应该用你说的这个 libxml 来解析。其实解析 url 的原理我是懂的，可是遇到具体问题的时候，就发现要处理好真的很难，尤其是像我这种新手。不过还是感谢 featherwit 老大！能不能教教小弟怎么写一个简单的拼接字符串的状态机？能否写一个实例看看，如何解决我的问题。最近被爬虫弄迷糊了，希望不吝赐教。
另外附上我刚刚读取网页文本解析链接的新代码:
/*
 * Scan a stream for href attributes and print every double-quoted value
 * that follows one, each on its own line.
 *
 * The input is read in fixed-size chunks and fed byte by byte through a
 * small state machine whose state survives chunk boundaries, so a link
 * that straddles two reads is still reported and every link in a chunk
 * is found (not just the first one).
 *
 *   SCAN_HREF  - looking for the letters h-r-e-f (any letter case)
 *   WAIT_QUOTE - "href" seen, waiting for the opening '"'; a '>' ends
 *                the tag and cancels the pending match
 *   IN_URL     - collecting bytes until the closing '"'
 *
 * Values longer than the URL buffer are truncated rather than overflowing.
 * After the scan, the number of read calls and the number of links found
 * are printed, one per line.
 *
 * fp: stream to read from; it is not closed here.
 * Returns the number of fread() calls performed (at least 1), or -1 if
 * fp is NULL.
 */
int parseurl(FILE *fp)
{
    enum { CHUNK_SIZE = 1024, URL_CAPACITY = 512, HREF_LEN = 4 };
    enum { SCAN_HREF, WAIT_QUOTE, IN_URL } state = SCAN_HREF;
    static const char href_lower[] = "href";
    static const char href_upper[] = "HREF";
    char buf[CHUNK_SIZE];
    char tmpurl[URL_CAPACITY];
    size_t matched = 0; /* letters of "href" matched so far (always < HREF_LEN) */
    size_t x = 0;       /* bytes stored in tmpurl for the current link */
    int cpos = 0;       /* number of fread() calls */
    int pos = 0;        /* number of links printed */

    if (fp == NULL) {
        return -1;
    }

    for (;;) {
        /* Only the n bytes actually read are valid; fread does not
         * NUL-terminate, so the buffer must never be treated as a string. */
        size_t n = fread(buf, 1, sizeof buf, fp);
        size_t i;

        cpos++;

        for (i = 0; i < n; i++) {
            char ch = buf[i];

            switch (state) {
            case SCAN_HREF:
                if (ch == href_lower[matched] || ch == href_upper[matched]) {
                    matched++;
                    if (matched == HREF_LEN) {
                        state = WAIT_QUOTE;
                        matched = 0;
                    }
                } else {
                    /* 'h' occurs only at the start of "href", so a failed
                     * match can restart at most one letter in. */
                    matched = (ch == 'h' || ch == 'H') ? 1 : 0;
                }
                break;

            case WAIT_QUOTE:
                if (ch == '"') {
                    state = IN_URL;
                    x = 0;
                } else if (ch == '>') {
                    /* Tag closed without a quoted value. */
                    state = SCAN_HREF;
                }
                break;

            case IN_URL:
                if (ch != '"') {
                    /* Keep one byte free for the terminator; excess
                     * bytes of an over-long value are dropped. */
                    if (x < sizeof tmpurl - 1) {
                        tmpurl[x++] = ch;
                    }
                } else {
                    tmpurl[x] = '\0';
                    pos++;
                    printf("%s\n", tmpurl);
                    state = SCAN_HREF;
                }
                break;

            default:
                break;
            }
        }

        /* A short read means end of file or a read error; either way
         * there is nothing more to fetch. */
        if (n < sizeof buf) {
            break;
        }
    }

    printf("%d\n", cpos);
    printf("%d\n", pos);
    return cpos;
}
/*
 * Entry point: opens "index.html" in the current directory, runs
 * parseurl() over it, then waits for a key press via the shell's
 * "pause" command so the console output stays visible.
 *
 * Returns 0 on success, 1 if the file cannot be opened.
 */
int main(void)
{
    FILE *fpt1;

    fpt1 = fopen("index.html", "r");
    if (fpt1 == NULL) {
        /* Report the failure instead of handing a NULL stream on. */
        perror("index.html");
        system("pause");
        return 1;
    }

    parseurl(fpt1);
    fclose(fpt1);
    system("pause");
    return 0;
}
|