Friday, April 24, 2009

Extract links from HTML in C#

/// <summary>
/// Extracts every complete anchor element (<c>&lt;a ...&gt;...&lt;/a&gt;</c>) from an HTML string.
/// </summary>
/// <param name="html">The HTML markup to scan. May be null or empty.</param>
/// <returns>
/// The anchor elements in document order, each including its opening and closing tag,
/// with the original character casing preserved. Empty when no complete anchor is found.
/// </returns>
/// <remarks>
/// This is a plain text scan, not an HTML parser: tag matching is case-insensitive, and
/// when an opening anchor tag is followed by another opening anchor tag before the next
/// closing tag, the outer (unclosed) one is skipped and scanning resumes at the inner one.
/// </remarks>
private List<string> ExtractLinks(string html)
{
    List<string> links = new List<string>();

    if (string.IsNullOrEmpty(html))
    {
        return links;
    }

    const string startSequence = "<a";
    const string endSequence = "</a>";

    // Walk the string with an index instead of repeatedly re-slicing it, and compare
    // case-insensitively instead of lower-casing the input, so that the extracted
    // links keep their original casing (URLs and link text are case-sensitive).
    int position = 0;

    while (true)
    {
        int start = IndexOfAnchorStart(html, position);
        if (start == -1)
        {
            break;
        }

        int contentStart = start + startSequence.Length;
        int end = html.IndexOf(endSequence, contentStart, System.StringComparison.OrdinalIgnoreCase);
        if (end == -1)
        {
            // No closing tag after this opening tag, so no later opening tag can be closed either.
            break;
        }

        // If another anchor opens before the closing tag, the current opening tag is
        // unclosed or malformed: drop it and restart from the inner anchor.
        int nested = IndexOfAnchorStart(html, contentStart);
        if (nested != -1 && nested < end)
        {
            position = nested;
            continue;
        }

        int afterEnd = end + endSequence.Length;
        links.Add(html.Substring(start, afterEnd - start));
        position = afterEnd;
    }

    return links;
}

/// <summary>
/// Finds the next opening anchor tag at or after <paramref name="startIndex"/>.
/// </summary>
/// <param name="html">The HTML markup to scan; must not be null.</param>
/// <param name="startIndex">The index at which to begin searching.</param>
/// <returns>The index of the '&lt;' of the opening tag, or -1 when there is none.</returns>
private static int IndexOfAnchorStart(string html, int startIndex)
{
    const string startSequence = "<a";
    int index = startIndex;

    while (index < html.Length)
    {
        index = html.IndexOf(startSequence, index, System.StringComparison.OrdinalIgnoreCase);
        if (index == -1)
        {
            return -1;
        }

        int next = index + startSequence.Length;
        if (next >= html.Length)
        {
            // "<a" at the very end of the input can never form a complete element.
            return -1;
        }

        // Only "<a>" or "<a" followed by whitespace is an anchor; this rejects other
        // tags that merely begin with the letter a, such as <abbr>, <area> or <article>.
        char following = html[next];
        if (following == '>' || char.IsWhiteSpace(following))
        {
            return index;
        }

        index = next;
    }

    return -1;
}

No comments: