Visual Basic Email Extractor

Introduction: Today we will be making a simple program to extract all emails within a web page source. We will be doing it manually rather than using Regex. Notes: This will require a webpage which contains one or more emails. I will be using this list for testing: http://pastebin.com/KBzSZVgh Steps of Creation: Step 1: First we are going to create a new project with: 1 Text-boxes - URL 1 Button - Begin new thread (which will begin the extractor script) 1 Listbox - Contain emails Step 2: Next we need to Import four packages. One for creating a request and receiving a response to and from the web page, another to read the response and the final one to create a new thread and the last for Regex.
  1. Imports System.IO
  2. Imports System.Net
  3. Imports System.Text.RegularExpressions
  4. Imports System.Threading
Step 3: The next step we want to add a function which we will use to get the text between all the HTML tags. We will be splitting the source by a space to get all the words. Then check each word if it contains appropriate signs ("@", ".") and if it does it may contain tags so we need to remove them.
  1. Private Function GetBetweenAll(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String) As String()
  2. Dim Results, T As New List(Of String)
  3. T.AddRange(Regex.Split(Source, Str1))
  4. T.RemoveAt(0)
  5. For Each I As String In T
  6. Results.Add(Regex.Split(I, Str2)(0))
  7. Next
  8. Return Results.ToArray
  9. End Function
Step 4: In the button one click event we are going to create a new thread which will start a function named "extract".
  1. Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click
  2. Dim trd As thread = New thread(AddressOf extract)
  3. trd.isbackground = True
  4. trd.start()
  5. End Sub
Step 5: Now lets check the URL in textbox1 to see if it is valid, if it is lets create a request, receive the response and read it to gain the web page source.
  1. Private Function extract()
  2. If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
  3. Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
  4. r.KeepAlive = True
  5. r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
  6. Dim re As HttpWebResponse = r.GetResponse()
  7. Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
  8. Else : MsgBox("That is not a valid link!")
  9. End If
  10. End Function
Step 6: Once we have got the source lets split it by a space to get each "word"/tag and check to see if each could be an email by looking for the "@" and "." signs.
  1. Private Function extract()
  2. If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
  3. Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
  4. r.KeepAlive = True
  5. r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
  6. Dim re As HttpWebResponse = r.GetResponse()
  7. Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
  8. Dim words As String() = src.Split(" ")
  9. For Each word As String In words
  10. If (word.Contains("@") And word.Contains(".")) Then
  11. End If
  12. Next
  13. Else : MsgBox("That is not a valid link!")
  14. End If
  15. End Function
Step 7: Now lets check to see if that particular word contains a "" which means it could be a tag. We don't want the tags so lets get the String between ">" and "Step 8: Finally, lets check how many emails there are. If there are more than one in the "toAdd" List, lets just add it to our listbox, otherwise lets iterate through them all and add them all to the listbox.
  1. Private Function extract()
  2. If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
  3. Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
  4. r.KeepAlive = True
  5. r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
  6. Dim re As HttpWebResponse = r.GetResponse()
  7. Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
  8. Dim words As String() = src.Split(" ")
  9. For Each word As String In words
  10. If (word.Contains("@") And word.Contains(".")) Then
  11. If (word.Contains("<") And word.Contains(">")) Then
  12. Dim toAdd As New List(Of String)
  13. Dim noTags As String() = GetBetweenAll(word, ">", "<")
  14. For Each w As String In noTags
  15. If (w.Contains("@") And w.Contains(".") And Not w.Contains("=")) Then
  16. If (w.EndsWith(",") Or w.EndsWith(".")) Then
  17. toAdd.Add(w.Substring(0, w.Length - 1))
  18. Else
  19. toAdd.Add(w)
  20. End If
  21. End If
  22. Next
  23. If (toAdd.Count > 0) Then
  24. If (toAdd.Count > 1) Then
  25. For Each t As String In toAdd
  26. ListBox1.Items.Add(t)
  27. Next
  28. Else
  29. ListBox1.Items.Add(toAdd(0))
  30. End If
  31. End If
  32. Else
  33. ListBox1.Items.Add(word)
  34. End If
  35. End If
  36. Next
  37. Else : MsgBox("That is not a valid link!")
  38. End If
  39. End Function
Important! To be able to access the controls from a new thread we need to set CheckForIllegalCrossThreadCalls to False in the Form1_load event. Remove Duplicates Function Lets add one last feature to remove all duplicate emails. This is very simple and just iterates through each item in the listbox1 and checks against a newly created list. If it is already in the list it won't add it otherwise it will. Then it simply iterates through the new list to add them all back in to the listbox1 (After clearing it first of course!) Of course, you could add this as a new thread as well so it doesn't temporarily crash the UI while processing.
  1. Private Sub Button3_Click(sender As Object, e As EventArgs) Handles Button3.Click
  2. Dim items As New List(Of String)
  3. For Each i As String In ListBox1.Items
  4. Dim isNew As Boolean = True
  5. For Each it As String In items
  6. If (it = i) Then isNew = False
  7. Next
  8. If (isNew) Then items.Add(i)
  9. Next
  10. ListBox1.Items.Clear()
  11. For Each i As String In items
  12. ListBox1.Items.Add(i)
  13. Next
  14. End Sub
Project Completed! That's it! Here is the finished source:
  1. Imports System.IO
  2. Imports System.Net
  3. Imports System.Text.RegularExpressions
  4. Imports System.Threading
  5. Public Class Form1
  6.  
  7. Private Function GetBetweenAll(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String) As String()
  8. Dim Results, T As New List(Of String)
  9. T.AddRange(Regex.Split(Source, Str1))
  10. T.RemoveAt(0)
  11. For Each I As String In T
  12. Results.Add(Regex.Split(I, Str2)(0))
  13. Next
  14. Return Results.ToArray
  15. End Function
  16.  
  17. Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button2.Click
  18. Dim trd As Thread = New Thread(AddressOf extract)
  19. trd.IsBackground = True
  20. trd.Start()
  21. End Sub
  22.  
  23. Private Function extract()
  24. If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
  25. Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
  26. r.KeepAlive = True
  27. r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
  28. Dim re As HttpWebResponse = r.GetResponse()
  29. Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
  30. Dim words As String() = src.Split(" ")
  31. For Each word As String In words
  32. If (word.Contains("@") And word.Contains(".")) Then
  33. If (word.Contains("<") And word.Contains(">")) Then
  34. Dim toAdd As New List(Of String)
  35. Dim noTags As String() = GetBetweenAll(word, ">", "<")
  36. For Each w As String In noTags
  37. If (w.Contains("@") And w.Contains(".") And Not w.Contains("=")) Then
  38. If (w.EndsWith(",") Or w.EndsWith(".")) Then
  39. toAdd.Add(w.Substring(0, w.Length - 1))
  40. Else
  41. toAdd.Add(w)
  42. End If
  43. End If
  44. Next
  45. If (toAdd.Count > 0) Then
  46. If (toAdd.Count > 1) Then
  47. For Each t As String In toAdd
  48. ListBox1.Items.Add(t)
  49. Next
  50. Else
  51. ListBox1.Items.Add(toAdd(0))
  52. End If
  53. End If
  54. Else
  55. ListBox1.Items.Add(word)
  56. End If
  57. End If
  58. Next
  59. Else : MsgBox("That is not a valid link!")
  60. End If
  61. End Function
  62.  
  63. Private Function removeTags(ByVal w As String)
  64. Dim toReturn As New List(Of String)
  65. Dim noTags As String() = GetBetweenAll(w, ">", "<")
  66. For Each word As String In noTags
  67. If (word.Contains("@") And word.Contains(".") And Not word.Contains("=")) Then
  68. toReturn.Add(word)
  69. End If
  70. Next
  71. Return toReturn
  72. End Function
  73.  
  74. Private Sub Form1_Load(sender As Object, e As EventArgs) Handles Me.Load
  75. CheckForIllegalCrossThreadCalls = False
  76. End Sub
  77.  
  78. Private Sub Button3_Click(sender As Object, e As EventArgs) Handles Button3.Click
  79. Dim items As New List(Of String)
  80. For Each i As String In ListBox1.Items
  81. Dim isNew As Boolean = True
  82. For Each it As String In items
  83. If (it = i) Then isNew = False
  84. Next
  85. If (isNew) Then items.Add(i)
  86. Next
  87. ListBox1.Items.Clear()
  88. For Each i As String In items
  89. ListBox1.Items.Add(i)
  90. Next
  91. End Sub
  92. End Class

Comments

Submitted byJerry Gonzalez (not verified)on Tue, 04/07/2015 - 01:38

Function 'extract' doesn't return a value on all code paths. A null reference exception could occur at run time when the result is used.

Add new comment