Scrape using XMLHTTP throws error at specific class name

I am trying to scrape a site with this code to extract names and contacts ...

Sub Test()
Dim htmlDoc         As Object
Dim htmlDoc2        As Object
Dim elem            As Variant
Dim tag             As Variant
Dim dns             As String
Dim pageSource      As String
Dim pageSource2     As String
Dim url             As String
Dim row             As Long

row = 2
dns = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/"

With CreateObject("MSXML2.XMLHTTP")
    .Open "GET", dns, True
    .send

    While .readyState <> 4: DoEvents: Wend

    If .statusText <> "OK" Then
        MsgBox "ERROR" & .Status & " - " & .statusText, vbExclamation
        Exit Sub
    End If

    pageSource = .responseText
End With

Set htmlDoc = CreateObject("htmlfile")
htmlDoc.body.innerHTML = pageSource

Dim xx 'Got error here Set xx = htmlDoc.getElementsByClassName("ldb-contact-summary")

Set htmlDoc = Nothing
Set htmlDoc2 = Nothing
End Sub

When trying to use this line

Set xx = htmlDoc.getElementsByClassName("ldb-contact-summary")

I got an error 'Object doesn't support that property or method' (438) Can you help me please as I am not so good at scraping issues?

2 answers

  • answered 2018-10-16 19:01 SIM

    To get the names and their corresponding phone numbers, you can try the below snippet:

    Sub GetProfileInfo()
        Const URL$ = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page="
        Dim Http As New XMLHTTP60, Html As New HTMLDocument
        Dim post As HTMLDivElement, R&, P&
    
        For p = 1 To 3 'put here the highest number you wanna traverse
            With Http
                .Open "GET", URL & p, False
                .send
                Html.body.innerHTML = .responseText
            End With
    
            For Each post In Html.getElementsByClassName("ldb-contact-summary")
                With post.querySelectorAll(".ldb-contact-name a")
                    If .Length Then R = R + 1: Cells(R, 1) = .item(0).innerText
                End With
    
                With post.getElementsByClassName("ldb-phone-number")
                    If .Length Then Cells(R, 2) = .item(0).innerText
                End With
            Next post
        Next p
    End Sub
    

    Reference to add to the library to execute the above script:

    Microsoft xml, v6.0
    Microsoft Html Object Library
    

  • answered 2018-10-16 20:06 QHarr

    As you mention all the pages in the comment above I will use a class to hold the XMLHTTP object and provide it with methods to extract the data, whilst incorporating a method to find the number of results pages and loop them. Testing this gave me 251 rows of results.

    Note: It seems you experience the requirement for human verification. I wrote a selenium version but it seems too many requests earn you a temporary block. Weirdly, my XMLHTTP version still runs for me.

    Class clsHTTP

    Option Explicit    
    Private http As Object
    
    Private Sub Class_Initialize()
        Set http = CreateObject("MSXML2.XMLHTTP")
    End Sub
    
    Public Function GetString(ByVal url As String) As String
        Dim sResponse As String
        With http
            .Open "GET", url, False
            .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
            .send
            sResponse = StrConv(.responseBody, vbUnicode)
            GetString = sResponse
        End With
    End Function
    
    Public Function GetInfo(ByVal html As HTMLDocument) As Variant
    
        Dim names As Object, telNums As Object, i As Long, namesArray(), telsArray()
        Set names = html.querySelectorAll("[class*='ldb-contact-name']")
        Set telNums = html.querySelectorAll(".ldb-phone-number")
    
        ReDim namesArray(0 To names.Length - 1)
        ReDim telsArray(0 To telNums.Length - 1)
    
        For i = 0 To names.Length - 1
            namesArray(i) = names.item(i).innerText
            telsArray(i) = telNums.item(i).innerText
        Next     
        GetInfo = Array(namesArray, telsArray)
    End Function
    

    Standard module 1

    Option Explicit
    Public Sub GetReviewData()
        Dim sResponse As String, html As HTMLDocument, http As clsHTTP
        Dim numPages As Long, pageNum As Long, url As String
        Dim results As Collection, item As Variant, ws As Worksheet
    
        url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/"
        Set http = New clsHTTP
        Set html = New HTMLDocument
        Set results = New Collection
        Set ws = ThisWorkbook.Worksheets("Sheet1")
    
        With html
            .body.innerHTML = http.GetString(url)           
            numPages = .querySelectorAll("[data-idx]").item(html.querySelectorAll("[data-idx]").Length - 2).innerText            
            results.Add http.GetInfo(html)
    
            If numPages > 1 Then
                For pageNum = 2 To numPages
                    url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=" & pageNum & "&showAdvancedItems=false&regionID=17762&locationText=Detroit%20MI"
                    .body.innerHTML = http.GetString(url)
                    results.Add http.GetInfo(html)
                Next
            End If
    
            Dim numResults As Long
            If results.Count > 0 Then
                Application.ScreenUpdating = False
                For Each item In results
                    numResults = UBound(item(0)) + 1
                    With ws
                        .Cells(GetLastRow(ws, 1) + 1, 1).Resize(numResults, 1) = Application.Transpose(item(0))
                        .Cells(GetLastRow(ws, 2) + 1, 2).Resize(numResults, 1) = Application.Transpose(item(1))
                    End With
                Next
                Application.ScreenUpdating = True
            End If
        End With
    
    End Sub
    
    Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
        With ws
            GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
        End With
    End Function
    

    Selenium:

    Option Explicit
    
    Public Sub GetReviewData()
        Dim html As HTMLDocument
        Dim numPages As Long, pageNum As Long, url As String
        Dim results As Collection, item As Variant, ws As Worksheet
        Dim d As WebDriver, elements As WebElements
    
        url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=1&showAdvancedItems=false&regionID=17762&locationText=Detroit%20MI"
        Set html = New HTMLDocument
        Set results = New Collection
        Set ws = ThisWorkbook.Worksheets("Sheet1")
    
        Set d = New ChromeDriver
        With d
            .Start "Chrome"
            .get url
    
            Set elements = .FindElementsByCss("[data-idx]")
            numPages = elements(elements.Count - 1).Text
            html.body.innerHTML = .PageSource
            results.Add GetInfo(html)
    
            If numPages > 1 Then
                For pageNum = 2 To numPages
    
                    Application.Wait Now + TimeSerial(0, 0, 2)
                    url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=" & pageNum & "&showAdvancedItems=false&regionID=17762&locationText=Detroit%20MI"
                    .get url
                    html.body.innerHTML = .PageSource
                    results.Add GetInfo(html)
                Next
            End If
    
            Dim numResults As Long
            If results.Count > 0 Then
                Application.ScreenUpdating = False
                For Each item In results
                    numResults = UBound(item(0)) + 1
                    With ws
                        .Cells(GetLastRow(ws, 1) + 1, 1).Resize(numResults, 1) = Application.Transpose(item(0))
                        .Cells(GetLastRow(ws, 2) + 1, 2).Resize(numResults, 1) = Application.Transpose(item(1))
                    End With
                Next
                Application.ScreenUpdating = True
            End If
        End With
    End Sub
    
    Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
        With ws
            GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
        End With
    End Function
    
    Public Function GetInfo(ByVal html As HTMLDocument) As Variant
    
        Dim names As Object, telNums As Object, i As Long, namesArray(), telsArray()
        Set names = html.querySelectorAll("[class*='ldb-contact-name']")
        Set telNums = html.querySelectorAll(".ldb-phone-number")
    
        ReDim namesArray(0 To names.Length - 1)
        ReDim telsArray(0 To telNums.Length - 1)
    
        For i = 0 To names.Length - 1
            namesArray(i) = names.item(i).innerText
            telsArray(i) = telNums.item(i).innerText
        Next
    
        GetInfo = Array(namesArray, telsArray)
    End Function