I realise you've asked this two weeks ago, but maybe you're still looking for an answer.
At the time of writing, I think the code below is working for me. I say at the time of writing because I get the impression that some of the id
s (in the HTML received from the server) are changing periodically -- which breaks the code.
This is what I've currently got:
The code is a bit of a mess, feel free to refactor. The entry point is ScrapeAllItemsFromEbayShop
.
Option Explicit
Private Function GetUrlForShopPageN(ByVal N As Long) As String
' Should return the store URL for page N,
' where N is some 1-based page index present in the query string.
GetUrlForShopPageN = "https://www.ebay.com/str/customwheelandperformancedepot?_pgn=" & N
End Function
Private Function GetHtmlForShopPageN(ByVal webClient As WinHttp.WinHttpRequest, ByVal N As Long) As MSHTML.HTMLDocument
' Should return a HTML document representing the response of server for page N,
' where N is some 1-based page index present in the query string.
Dim targetUrl As String
targetUrl = GetUrlForShopPageN(N)
With webClient
.Open "GET", targetUrl, False
.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
.send
Set GetHtmlForShopPageN = New MSHTML.HTMLDocument
GetHtmlForShopPageN.body.innerHTML = .responseText
End With
End Function
Private Function DoesShopPageNotContainResults(ByVal htmlResponse As MSHTML.HTMLDocument) As Boolean
' Should return a boolean representing whether the htmlResponse contains zero results.
DoesShopPageNotContainResults = (htmlResponse.getElementsByClassName("srp-controls").Length = 0)
End Function
Private Function GetUrlsOfItemsToScrape() As Collection
' Should return a collection of strings, representings the URLs of items.
Set GetUrlsOfItemsToScrape = New Collection
Dim webClient As WinHttp.WinHttpRequest
Set webClient = New WinHttp.WinHttpRequest
Do While True
Dim pageIndex As Long
pageIndex = pageIndex + 1
Dim htmlResponse As MSHTML.HTMLDocument
Set htmlResponse = GetHtmlForShopPageN(webClient, pageIndex)
If DoesShopPageNotContainResults(htmlResponse) Then Exit Do
Dim anchor As MSHTML.IHTMLElement
For Each anchor In htmlResponse.getElementsByClassName("s-item__link")
Debug.Assert StrComp(LCase$(Left$(anchor.getAttribute("href"), 25)), "https://www.ebay.com/itm/", vbBinaryCompare) = 0
GetUrlsOfItemsToScrape.Add anchor.getAttribute("href")
If GetUrlsOfItemsToScrape.Count > 10 Then Exit Do ' Delete this line completely once you think everything is working.
Next anchor
If (0 = (pageIndex Mod 10)) Then DoEvents
Loop
End Function
Private Function DoesTextContainAnyOf(ByVal textToCheck As String, stringsToCheck As Variant) As Boolean
' Should return a boolean representing whether any of "stringsToCheck"
' can be found within "textToCheck". Performs a case-sensitive search.
Dim i As Long
For i = LBound(stringsToCheck) To UBound(stringsToCheck)
If InStr(1, textToCheck, stringsToCheck(i), vbBinaryCompare) Then
DoesTextContainAnyOf = True
Exit For
End If
Next i
End Function
Private Function IsItemAWheelOnly(ByVal htmlResponse As MSHTML.HTMLDocument) As Boolean
' Should return True if, based on the HTML, the item is inferred to be a "wheel".
Dim itemSpecifics As MSHTML.IHTMLTableSection
Set itemSpecifics = htmlResponse.querySelector(".itemAttr tbody")
Debug.Assert Not (itemSpecifics Is Nothing)
Dim tireAndPackageIdentifiers As Variant
tireAndPackageIdentifiers = Array("tire", "section width", "aspect ratio")
Dim tableRow As MSHTML.IHTMLTableRow
For Each tableRow In itemSpecifics.Rows
Debug.Assert 0 = (tableRow.Cells.Length Mod 2)
Dim columnIndex As Long
For columnIndex = 0 To (tableRow.Cells.Length - 1) Step 2
Debug.Assert InStr(1, tableRow.Cells(columnIndex).className, "attrLabels", vbBinaryCompare)
If DoesTextContainAnyOf(LCase$(tableRow.Cells(columnIndex).innerText), tireAndPackageIdentifiers) Then Exit Function
Next columnIndex
Next tableRow
IsItemAWheelOnly = True
End Function
Private Function GetHtmlForItem(ByVal webClient As WinHttp.WinHttpRequest, ByVal urlForItem As String) As MSHTML.HTMLDocument
' Should return a HTML document representing the response of server for a given item.
With webClient
.Open "GET", urlForItem, False
.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
.send
Set GetHtmlForItem = New MSHTML.HTMLDocument
GetHtmlForItem.body.innerHTML = .responseText
End With
End Function
Private Sub ScrapeAllItemsFromEbayShop()
Dim webClient As WinHttp.WinHttpRequest
Set webClient = New WinHttp.WinHttpRequest
Dim urlsOfItemsToScrape As Collection
Set urlsOfItemsToScrape = GetUrlsOfItemsToScrape()
Dim rowWriteIndex As Long
rowWriteIndex = 1 ' Skip row 1/headers
Dim destinationSheet As Worksheet
Set destinationSheet = ThisWorkbook.Worksheets("Sheet1")
destinationSheet.Cells.ClearContents
Dim columnIndexes As Collection
Set columnIndexes = New Collection
Dim urlOfItem As Variant
For Each urlOfItem In urlsOfItemsToScrape
Debug.Print urlOfItem
Dim htmlOfItemPage As MSHTML.HTMLDocument
Set htmlOfItemPage = GetHtmlForItem(webClient, urlOfItem)
Dim nameValuePairs As Collection
If IsItemAWheelOnly(htmlOfItemPage) Then
Set nameValuePairs = CreateNameValuePairsForWheelOnly(htmlOfItemPage)
Else
Set nameValuePairs = CreateNameValuePairsForWheelAndTirePackage(htmlOfItemPage)
End If
rowWriteIndex = rowWriteIndex + 1
Dim nameValuePair As Variant
For Each nameValuePair In nameValuePairs
Dim columnWriteIndex As Long
columnWriteIndex = GetColumnIndexOfHeader(columnIndexes, nameValuePair(0))
If columnWriteIndex = 0 Then
columnWriteIndex = columnIndexes.Count + 1
columnIndexes.Add columnWriteIndex, Key:=nameValuePair(0)
destinationSheet.Cells(1, columnWriteIndex).Value = nameValuePair(0)
End If
destinationSheet.Cells(rowWriteIndex, columnWriteIndex).Value = nameValuePair(1)
Next nameValuePair
DoEvents
Next urlOfItem
End Sub
Private Function CreateNameValuePairsForWheelAndTirePackage(ByVal htmlOfItemPage As MSHTML.HTMLDocument) As Collection
' Should return a collection of 2-element arrays (where each 2-element array
' represents a name-value pair).
Dim outputCollection As Collection
Set outputCollection = New Collection
Dim targetElement As MSHTML.IHTMLElement
Set targetElement = htmlOfItemPage.getElementById("itemTitle")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Title", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("mm-saleOrgPrc")
If targetElement Is Nothing Then
Set targetElement = htmlOfItemPage.getElementById("prcIsum")
Debug.Assert Not (targetElement Is Nothing)
End If
outputCollection.Add CreateNameValuePair("Price", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("descItemNumber")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("eBay Item Number", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("desc_div")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Description HTML", targetElement.innerHTML)
Set CreateNameValuePairsForWheelAndTirePackage = outputCollection
End Function
Private Function CreateNameValuePairsForWheelOnly(ByVal htmlOfItemPage As MSHTML.HTMLDocument) As Collection
' Should return a collection of 2-element arrays (where each 2-element array
' represents a name-value pair).
Dim outputCollection As Collection
Set outputCollection = New Collection
Dim targetElement As MSHTML.IHTMLElement
Set targetElement = htmlOfItemPage.getElementById("itemTitle")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Title", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("mm-saleOrgPrc")
If targetElement Is Nothing Then
Set targetElement = htmlOfItemPage.getElementById("prcIsum")
Debug.Assert Not (targetElement Is Nothing)
End If
outputCollection.Add CreateNameValuePair("Price", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("descItemNumber")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("eBay Item Number", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("desc_wrapper_ctr")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Description HTML", targetElement.innerHTML)
Dim itemSpecifics As MSHTML.IHTMLTableSection
Set itemSpecifics = htmlOfItemPage.querySelector(".itemAttr tbody")
Debug.Assert Not (itemSpecifics Is Nothing)
Dim tableRow As MSHTML.IHTMLTableRow
For Each tableRow In itemSpecifics.Rows
Debug.Assert 0 = (tableRow.Cells.Length Mod 2)
Dim columnIndex As Long
For columnIndex = 0 To (tableRow.Cells.Length - 1) Step 2
Debug.Assert InStr(1, tableRow.Cells(columnIndex).className, "attrLabels", vbBinaryCompare)
outputCollection.Add CreateNameValuePair(tableRow.Cells(columnIndex).innerText, tableRow.Cells(columnIndex + 1).innerText)
Next columnIndex
Next tableRow
Set CreateNameValuePairsForWheelOnly = outputCollection
End Function
Private Function CreateNameValuePair(ByVal someName As String, ByVal someValue As String) As String()
Dim outputArray(0 To 1) As String
outputArray(0) = someName
outputArray(1) = someValue
CreateNameValuePair = outputArray
End Function
Private Function GetColumnIndexOfHeader(ByVal columnIndexes As Collection, ByVal header As String) As Long
' Should return a 1-based column index associated with "header".
' If "header" does not exist within collection, 0 is returned.
On Error Resume Next
GetColumnIndexOfHeader = columnIndexes(header)
On Error GoTo 0
End Function
This code is slow for a number of reasons:
- A lot of time is spent waiting for a response from the server.
- Each item is scraped in a serial and synchronous manner.
- Output is written to the sheet one cell a time (instead of using arrays and reducing the number of read/write operations involving the sheet).
- No toggling of
Application.Calculation
or Application.ScreenUpdating
.
Once you think the code is working, you'll want to get rid of this line If GetUrlsOfItemsToScrape.Count > 10 Then Exit Do
in the GetUrlsOfItemsToScrape
function. Otherwise you won't scrape all items.
I've left DoEvents
inside the Do
loops to keep things responsive (at the cost of some performance perhaps)