I know this is not a perfect approach, but I would like to give some hints to anyone who wants to download all the links in an HTML page.
Example usage is something like this.
Main code is below.
Helper class that represents a link.
Example usage is something like this.
// Only download "jpg" files.
// NOTE: the first argument (the page URL) is an empty placeholder here;
// replace it with the absolute URL of the page whose links should be saved.
HttpUtils.SaveFirstLevelLinksToFile(
    "",
    Encoding.UTF8,
    "c:/temp/",
    // The filter is a single predicate over the extracted link; it keeps links
    // whose extension is ".jpg", compared case-insensitively.
    link => ".jpg".Equals(Path.GetExtension(link.Link), StringComparison.OrdinalIgnoreCase));
Main code is below.
/// <summary>
/// Main entry point: fetches the page at <paramref name="baseUri"/>, extracts every
/// anchor link matched by <see cref="LINK_REGEX"/>, and saves each link accepted by
/// <paramref name="filter"/> to a file under <paramref name="dir"/>.
/// </summary>
/// <param name="baseUri">URL of the page to scan; also the base used to resolve relative links.</param>
/// <param name="enc">Encoding forwarded to the page reader (presumably used to decode the response body).</param>
/// <param name="dir">
/// Target directory prefix. The file name is appended by plain string concatenation,
/// so this value must already end with a directory separator (e.g. "c:/temp/").
/// </param>
/// <param name="filter">Predicate over an extracted link; links for which it returns false are skipped.</param>
/// <remarks>
/// Errors are handled per link: an exception raised while filtering, resolving or saving
/// one link is written to the console and processing continues with the next link.
/// Exceptions raised while fetching the page itself are not caught here.
/// </remarks>
public static void SaveFirstLevelLinksToFile(string baseUri, Encoding enc, string dir, Func<LinkAttr, bool> filter)
{
    ProcessAllExtractedLinksInHtmlText(GetPage(baseUri, enc), link =>
    {
        try
        {
            if (!filter(link))
            {
                return;
            }

            Uri uri = ConvertToAbsoluteURL(baseUri, link.Link);

            // GetFileName / GetAndSaveToFile are string extension helpers defined elsewhere.
            // '?' is stripped from the derived file name before it is appended to dir.
            var filePath = dir + uri.AbsoluteUri.GetFileName().Replace("?", "");
            uri.AbsoluteUri.GetAndSaveToFile(filePath);
        }
        catch (Exception e)
        {
            // Print the whole exception (type, message and stack trace); the stack
            // trace alone does not say what actually went wrong.
            Console.WriteLine(e);
        }
    });
}

/// <summary>
/// Matches an <c>&lt;a ... href=...&gt;...&lt;/a&gt;</c> element.
/// Group 1 is the href value (quoted with ' or ", or unquoted), group 2 is the text
/// between the opening and closing tags, group 3 is the opening quote character.
/// </summary>
/// <remarks>
/// NOTE(review): inside the character class <c>[^\3&gt;]</c> the <c>\3</c> is not a
/// backreference; the quoted value is terminated by the lazy quantifier together with
/// the trailing <c>\3</c> backreference. No Singleline option is set, so <c>.</c> does
/// not match a newline and anchors whose body spans several lines are not matched.
/// </remarks>
private static readonly Regex LINK_REGEX = new Regex(
    @"<a\s+[^>]*href\s*=\s*(?:(?<3>'|"")(?<1>[^\3>]*?)\3|(?<1>[^\s>]+))[^>]*>(?<2>.*?)</a>",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Invokes <paramref name="process"/> once for every anchor found in <paramref name="text"/>,
/// in match order, passing a <see cref="LinkAttr"/> built from the href value and the anchor body.
/// </summary>
/// <param name="text">HTML text to scan.</param>
/// <param name="process">Callback that receives each extracted link.</param>
public static void ProcessAllExtractedLinksInHtmlText(string text, Action<LinkAttr> process)
{
    LINK_REGEX.ApplyAllMatched(text, m =>
    {
        process(new LinkAttr(m.Groups[1].Value, m.Groups[2].Value));
    });
}

/// <summary>
/// Calls <paramref name="apply"/> for each successive successful match of
/// <paramref name="regex"/> in <paramref name="text"/>.
/// </summary>
/// <param name="regex">Regular expression to run.</param>
/// <param name="text">Input text.</param>
/// <param name="apply">Callback that receives every successful match.</param>
public static void ApplyAllMatched(this Regex regex, string text, Action<Match> apply)
{
    for (var m = regex.Match(text); m.Success; m = m.NextMatch())
    {
        apply(m);
    }
}

/// <summary>
/// Issues a GET request for <paramref name="url"/> and returns the response body as a string.
/// </summary>
/// <param name="url">URL to request.</param>
/// <param name="encoding">Encoding handed to the stream-reading helper.</param>
/// <returns>The value produced by the <c>ReadAsString</c> stream extension (defined elsewhere).</returns>
public static string GetPage(string url, Encoding encoding)
{
    return ProcessGet<string>(url, stream => stream.ReadAsString(encoding));
}

/// <summary>
/// Requests <paramref name="url"/> and passes the response stream to <paramref name="process"/>.
/// The response and its stream are disposed after the callback returns, so the callback
/// must consume the stream fully rather than hand it back to the caller.
/// </summary>
/// <typeparam name="T">Type of the value computed from the response stream.</typeparam>
/// <param name="url">URL to request.</param>
/// <param name="process">Callback that reads the response stream and produces the result.</param>
/// <returns>The value returned by <paramref name="process"/>.</returns>
public static T ProcessGet<T>(string url, Func<Stream, T> process)
{
    var request = WebRequest.Create(url);
    using (var response = request.GetResponse())
    using (var resStream = response.GetResponseStream())
    {
        return process(resStream);
    }
}

/// <summary>
/// Returns <paramref name="relativeUri"/> as an absolute <see cref="Uri"/>: unchanged when it is
/// already absolute, otherwise resolved against <paramref name="rootUri"/>.
/// </summary>
/// <param name="rootUri">Absolute base URL; only parsed when <paramref name="relativeUri"/> is relative.</param>
/// <param name="relativeUri">Absolute or relative link.</param>
/// <returns>The absolute URI for the link.</returns>
public static Uri ConvertToAbsoluteURL(string rootUri, string relativeUri)
{
    var relative = new Uri(relativeUri, UriKind.RelativeOrAbsolute);
    if (relative.IsAbsoluteUri)
    {
        return relative;
    }

    return new Uri(new Uri(rootUri), relative);
}
Helper class that represents a link.
/// <summary>
/// Simple mutable holder for one extracted link: its target and its body text.
/// </summary>
public class LinkAttr
{
    /// <summary>
    /// Creates a link description from the given target and body.
    /// </summary>
    /// <param name="link">Link target, stored in <see cref="Link"/>.</param>
    /// <param name="body">Link body, stored in <see cref="Body"/>.</param>
    public LinkAttr(string link, string body)
    {
        Link = link;
        Body = body;
    }

    /// <summary>Link target as supplied to the constructor (or assigned later).</summary>
    public string Link { get; set; }

    /// <summary>Link body as supplied to the constructor (or assigned later).</summary>
    public string Body { get; set; }
}
コメント