Introduction:
Welcome to my tutorial on how to create a simple program to grab the information of a video off of YouTube. It will scrape:
Title
Description
View Count
Like Count
Dislike Count
You can also make it scrape the comments, related videos, uploader etc.
Steps of Creation:
Step 1:
The first thing we need to do is import System.Net so we can use HttpWebRequest and HttpWebResponse. We also need Regex (from System.Text.RegularExpressions) and a pre-made function to extract a String between two String points:
Imports System.Net
Imports System.Text.RegularExpressions
''' <summary>
''' Returns the text that lies between two literal markers in <paramref name="Source"/>.
''' </summary>
''' <param name="Source">Text to search.</param>
''' <param name="Str1">Opening marker, matched literally.</param>
''' <param name="Str2">Closing marker, matched literally.</param>
''' <param name="Index">Zero-based occurrence of <paramref name="Str1"/> to start from.</param>
''' <returns>
''' The text after the selected occurrence of <paramref name="Str1"/>, ending at the first
''' <paramref name="Str2"/> or the next <paramref name="Str1"/>, whichever comes first
''' (or at the end of the text if neither follows). An empty string is returned when the
''' requested occurrence of <paramref name="Str1"/> does not exist.
''' </returns>
Private Function GetBetween(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String, Optional ByVal Index As Integer = 0) As String
    ' Regex.Split interprets its pattern as a regular expression, so the markers are
    ' escaped; otherwise characters such as "(", "?" or "." would change what is matched.
    Dim segments As String() = Regex.Split(Source, Regex.Escape(Str1))

    ' segments(0) is the text before the first marker, so occurrence N begins segments(N + 1).
    If Index < 0 OrElse Index + 1 >= segments.Length Then
        Return String.Empty
    End If

    Return Regex.Split(segments(Index + 1), Regex.Escape(Str2))(0)
End Function
Step 2:
Now add a TextBox to your form to hold the video URL and a Button to begin the process. In the button's Click event we first open a SaveFileDialog to choose where to save the results, and then validate the URL:
' Ask where the results should be saved. Stop when the user cancels the dialog,
' because fs.FileName is empty in that case and cannot be opened for writing later.
Dim fs As SaveFileDialog = New SaveFileDialog
fs.RestoreDirectory = True
fs.Filter = "txt files (*.txt)|*.txt"
fs.FilterIndex = 1
If fs.ShowDialog() <> DialogResult.OK Then
    Return
End If

' Turn whatever form of address was typed into an absolute URL.
' Only the comparisons ignore case; the typed text itself is kept unchanged
' because YouTube video IDs are case-sensitive.
Dim typed As String = TextBox1.Text.Trim()
Dim url As String = ""
If typed.IndexOf("youtube", StringComparison.OrdinalIgnoreCase) >= 0 Then
    If typed.StartsWith("http://", StringComparison.OrdinalIgnoreCase) OrElse
       typed.StartsWith("https://", StringComparison.OrdinalIgnoreCase) Then
        url = typed
    ElseIf typed.StartsWith("www.", StringComparison.OrdinalIgnoreCase) Then
        url = "http://" & typed
    Else
        url = "http://www." & typed
    End If
ElseIf typed.StartsWith("/watch", StringComparison.OrdinalIgnoreCase) Then
    url = "http://www.youtube.com" & typed
ElseIf typed.StartsWith("watch", StringComparison.OrdinalIgnoreCase) Then
    url = "http://www.youtube.com/" & typed
End If

' None of the recognised forms matched: there is nothing to request.
If url = "" Then
    MessageBox.Show("Please enter a YouTube video URL.")
    Return
End If
Step 3:
Now, we want to send a request to the url and read the response as our source page String. Then we want to extract each piece of information and run it through a custom function which we will make in a minute:
' Download the page source. The Using blocks release the response (and its
' connection) and the reader even when reading fails part-way through.
Dim src As String
Dim r As HttpWebRequest = CType(HttpWebRequest.Create(url), HttpWebRequest)
Using re As HttpWebResponse = CType(r.GetResponse(), HttpWebResponse)
    Using reader As New System.IO.StreamReader(re.GetResponseStream())
        src = reader.ReadToEnd()
    End Using
End Using

' Pull each value out of the markup. These markers are tied to the page's HTML,
' so they must be updated if the page layout changes.
' The title is read from the title="..." attribute of the eow-title span.
Dim title2 As String = GetBetween(src, "<span id=""eow-title""", ">")
Dim title As String = GetBetween(title2, "title=""", """")
Dim desc As String = GetBetween(src, "<p id=""eow-description"" >", "</p>")
Dim likes As String = GetBetween(src, "<span class=""likes-count"">", "</span")
Dim dislikes As String = GetBetween(src, "<span class=""dislikes-count"">", "</span")
Dim views As String = GetBetween(src, "<span class=""watch-view-count "" >", "</span")

' Title and description keep their spaces; the counters have them removed.
title = removeExtras(title, False)
desc = removeExtras(desc, False)
likes = removeExtras(likes)
dislikes = removeExtras(dislikes)
views = removeExtras(views)

' Write the results to the file chosen in the save dialog.
Using sw As New System.IO.StreamWriter(fs.FileName)
    sw.WriteLine(title)
    sw.WriteLine(desc)
    sw.WriteLine("Likes: " & likes)
    sw.WriteLine("Dislikes: " & dislikes)
    sw.WriteLine("Total Views: " & views)
End Using
Once we have parsed all the information we want to write the information to the save path.
Step 4:
Now, for the custom function. This is just a function to:
- Remove spaces from the view count, like count and dislike count.
- Replace &quot; from HTML with a quotation mark (").
- Replace &#39; from HTML with an apostrophe (').
- Remove HTML tags, mainly from the description (links etc).
''' <summary>
''' Cleans a fragment of scraped HTML: optionally removes spaces, strips tags, and
''' converts the &amp;quot; and &amp;#39; entities back to their characters.
''' </summary>
''' <param name="s">The raw fragment to clean.</param>
''' <param name="removeSpaces">
''' True (the default) to delete every space character, which suits numeric counters;
''' False to keep spaces, which suits titles and descriptions.
''' </param>
''' <returns>The cleaned text, or an empty string when <paramref name="s"/> is Nothing or empty.</returns>
Private Function removeExtras(ByVal s As String, Optional ByVal removeSpaces As Boolean = True) As String
    If String.IsNullOrEmpty(s) Then
        Return String.Empty
    End If

    Dim ret As String = s
    If removeSpaces Then
        ret = ret.Replace(" ", "")
    End If

    ' Tags are stripped only when both bracket characters occur; text holding a lone
    ' "<" or ">" is left untouched.
    If ret.Contains("<") AndAlso ret.Contains(">") Then
        Dim outsideTag As Boolean = True
        Dim kept As New System.Text.StringBuilder(ret.Length)
        For Each c As Char In ret
            If c = "<"c Then
                outsideTag = False
            ElseIf c = ">"c Then
                outsideTag = True
            ElseIf outsideTag Then
                kept.Append(c)
            End If
        Next
        ret = kept.ToString()
    End If

    ' Entities are decoded after tag stripping so the decoded characters are never
    ' mistaken for markup.
    ret = ret.Replace("&quot;", """")
    ret = ret.Replace("&#39;", "'")
    Return ret
End Function
Project Complete!
Below is the full source code and a download to the project files:
Imports System.Net
Imports System.Text.RegularExpressions
Public Class Form1
''' <summary>
''' Returns the text that lies between two literal markers in <paramref name="Source"/>.
''' </summary>
''' <param name="Source">Text to search.</param>
''' <param name="Str1">Opening marker, matched literally.</param>
''' <param name="Str2">Closing marker, matched literally.</param>
''' <param name="Index">Zero-based occurrence of <paramref name="Str1"/> to start from.</param>
''' <returns>
''' The text after the selected occurrence of <paramref name="Str1"/>, ending at the first
''' <paramref name="Str2"/> or the next <paramref name="Str1"/>, whichever comes first
''' (or at the end of the text if neither follows). An empty string is returned when the
''' requested occurrence of <paramref name="Str1"/> does not exist.
''' </returns>
Private Function GetBetween(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String, Optional ByVal Index As Integer = 0) As String
    ' Regex.Split interprets its pattern as a regular expression, so the markers are
    ' escaped; otherwise characters such as "(", "?" or "." would change what is matched.
    Dim segments As String() = Regex.Split(Source, Regex.Escape(Str1))

    ' segments(0) is the text before the first marker, so occurrence N begins segments(N + 1).
    If Index < 0 OrElse Index + 1 >= segments.Length Then
        Return String.Empty
    End If

    Return Regex.Split(segments(Index + 1), Regex.Escape(Str2))(0)
End Function
''' <summary>
''' Handles the button click: asks for a save location, builds the video URL from the
''' text box, downloads the page, extracts the title, description, likes, dislikes and
''' view count, and writes them to the chosen file.
''' </summary>
Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click
    ' Ask where the results should be saved. Stop when the user cancels the dialog,
    ' because FileName is empty in that case and cannot be opened for writing.
    Dim savePath As String
    Using fs As New SaveFileDialog
        fs.RestoreDirectory = True
        fs.Filter = "txt files (*.txt)|*.txt"
        fs.FilterIndex = 1
        If fs.ShowDialog() <> DialogResult.OK Then
            Return
        End If
        savePath = fs.FileName
    End Using

    ' Turn whatever form of address was typed into an absolute URL.
    ' Only the comparisons ignore case; the typed text itself is kept unchanged
    ' because YouTube video IDs are case-sensitive.
    Dim typed As String = TextBox1.Text.Trim()
    Dim url As String = ""
    If typed.IndexOf("youtube", StringComparison.OrdinalIgnoreCase) >= 0 Then
        If typed.StartsWith("http://", StringComparison.OrdinalIgnoreCase) OrElse
           typed.StartsWith("https://", StringComparison.OrdinalIgnoreCase) Then
            url = typed
        ElseIf typed.StartsWith("www.", StringComparison.OrdinalIgnoreCase) Then
            url = "http://" & typed
        Else
            url = "http://www." & typed
        End If
    ElseIf typed.StartsWith("/watch", StringComparison.OrdinalIgnoreCase) Then
        url = "http://www.youtube.com" & typed
    ElseIf typed.StartsWith("watch", StringComparison.OrdinalIgnoreCase) Then
        url = "http://www.youtube.com/" & typed
    End If

    ' None of the recognised forms matched: there is nothing to request.
    If url = "" Then
        MessageBox.Show("Please enter a YouTube video URL.")
        Return
    End If

    ' Download the page source. The Using blocks release the response (and its
    ' connection) and the reader even when reading fails part-way through.
    Dim src As String
    Dim r As HttpWebRequest = CType(HttpWebRequest.Create(url), HttpWebRequest)
    Using re As HttpWebResponse = CType(r.GetResponse(), HttpWebResponse)
        Using reader As New System.IO.StreamReader(re.GetResponseStream())
            src = reader.ReadToEnd()
        End Using
    End Using

    ' Pull each value out of the markup. These markers are tied to the page's HTML,
    ' so they must be updated if the page layout changes.
    ' The title is read from the title="..." attribute of the eow-title span.
    Dim title2 As String = GetBetween(src, "<span id=""eow-title""", ">")
    Dim title As String = GetBetween(title2, "title=""", """")
    Dim desc As String = GetBetween(src, "<p id=""eow-description"" >", "</p>")
    Dim likes As String = GetBetween(src, "<span class=""likes-count"">", "</span")
    Dim dislikes As String = GetBetween(src, "<span class=""dislikes-count"">", "</span")
    Dim views As String = GetBetween(src, "<span class=""watch-view-count "" >", "</span")

    ' Title and description keep their spaces; the counters have them removed.
    title = removeExtras(title, False)
    desc = removeExtras(desc, False)
    likes = removeExtras(likes)
    dislikes = removeExtras(dislikes)
    views = removeExtras(views)

    ' Write the results to the file chosen in the save dialog.
    Using sw As New System.IO.StreamWriter(savePath)
        sw.WriteLine(title)
        sw.WriteLine(desc)
        sw.WriteLine("Likes: " & likes)
        sw.WriteLine("Dislikes: " & dislikes)
        sw.WriteLine("Total Views: " & views)
    End Using
End Sub
''' <summary>
''' Cleans a fragment of scraped HTML: optionally removes spaces, strips tags, and
''' converts the &amp;quot; and &amp;#39; entities back to their characters.
''' </summary>
''' <param name="s">The raw fragment to clean.</param>
''' <param name="removeSpaces">
''' True (the default) to delete every space character, which suits numeric counters;
''' False to keep spaces, which suits titles and descriptions.
''' </param>
''' <returns>The cleaned text, or an empty string when <paramref name="s"/> is Nothing or empty.</returns>
Private Function removeExtras(ByVal s As String, Optional ByVal removeSpaces As Boolean = True) As String
    If String.IsNullOrEmpty(s) Then
        Return String.Empty
    End If

    Dim ret As String = s
    If removeSpaces Then
        ret = ret.Replace(" ", "")
    End If

    ' Tags are stripped only when both bracket characters occur; text holding a lone
    ' "<" or ">" is left untouched.
    If ret.Contains("<") AndAlso ret.Contains(">") Then
        Dim outsideTag As Boolean = True
        Dim kept As New System.Text.StringBuilder(ret.Length)
        For Each c As Char In ret
            If c = "<"c Then
                outsideTag = False
            ElseIf c = ">"c Then
                outsideTag = True
            ElseIf outsideTag Then
                kept.Append(c)
            End If
        Next
        ret = kept.ToString()
    End If

    ' Entities are decoded after tag stripping so the decoded characters are never
    ' mistaken for markup.
    ret = ret.Replace("&quot;", """")
    ret = ret.Replace("&#39;", "'")
    Return ret
End Function
End Class