본문 바로가기
카테고리 없음

서울교차로(newspaper.seoulkcr.com)구인 크롤링

by IT HUB 2020. 7. 16.
728x90
반응형

 

서울교차로(newspaper.seoulkcr.com)구인 크롤링을 해보자

 

지식인 답변내용입니다.

 

Private Sub program1472_com()

    Dim elem As Object

    Dim oHTML As Object

    Set oHTML = CreateObject("htmlfile")

    Dim URL As String, Cookie As String, T As String, P As Integer

    Cells.Clear

    Do

        P = P + 1

        URL = "http://newspaper.seoulkcr.com/paper?kcrid=kangnamkcr&cateid=0001&page=" & P

        With CreateObject("WinHttp.WinHttpRequest.5.1")

            .Open "GET", URL

            .SetRequestHeader "Accept""text/html, application/xhtml+xml, */*"

            .SetRequestHeader "Accept-Language""ko-KR"

            .SetRequestHeader "User-Agent""Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"

            .SetRequestHeader "Host""newspaper.seoulkcr.com"

            .SetRequestHeader "Connection""Keep-Alive"

            If Len(Cookie) Then .SetRequestHeader "Cookie", Cookie

            .Send

            .WaitForResponse: DoEvents

            T = .ResponseText

            'T = StrConv(.ResponseBody, vbUnicode)

        End With

        oHTML.body.innerHTML = T

        Dim O As Object, i As Integer

        Set O = oHTML.getElementById("container")

        Set elem = getXPathElement("/div/div/ul", O)

        Dim V(2As Variant

        For i = 1 To 20

            Set O = getXPathElement("/li[" & i & "]", elem)

            If O Is Nothing Then GoTo ExitSub

            V(0= (P - 1* 20 + i

            V(1= getXPathElement("/ul/li[1]", O).innerTEXT

            V(2= getXPathElement("/ul/li[2]", O).innerTEXT

            Cells(Rows.Count, "A").End(3)(2).Resize(, 3).Value = V

        Next

    Loop

ExitSub:

    MsgBox "완료!!"

End Sub

 

Public Function getXPathElement(sXPath As String, objElement As ObjectAs Object

    On Error GoTo ErrPass

    Dim sXPathArray() As String

    Dim sNodeName As String, sNodeNameIndex As String

    Dim sRestOfXPath As String

    Dim lNodeIndex As Long, lCount As Long

    sXPathArray = Split(sXPath, "/")

    sNodeNameIndex = sXPathArray(1)

    If Not InStr(sNodeNameIndex, "["> 0 Then

        sNodeName = sNodeNameIndex

        lNodeIndex = 1

    Else

        sXPathArray = Split(sNodeNameIndex, "[")

        sNodeName = sXPathArray(0)

        lNodeIndex = CLng(Left(sXPathArray(1), Len(sXPathArray(1)) - 1))

    End If

    sRestOfXPath = Right(sXPath, Len(sXPath) - (Len(sNodeNameIndex) + 1))

    Set getXPathElement = Nothing

    For lCount = 0 To objElement.ChildNodes().Length - 1

        If UCase(objElement.ChildNodes().Item(lCount).nodeName) = UCase(sNodeName) Then

            If lNodeIndex = 1 Then

                If sRestOfXPath = "" Then

                    Set getXPathElement = objElement.ChildNodes().Item(lCount)

                Else

                    Set getXPathElement = getXPathElement(sRestOfXPath, objElement.ChildNodes().Item(lCount))

                End If

            End If

            lNodeIndex = lNodeIndex - 1

        End If

    Next lCount

ErrPass:

End Function

 

 

아래와 같은 결과를 뽑아냈다.

 

 

 

 

무엇보다 크롤링에는 많은 경험과 노하우가 필요한 분야중 하나라고 생각한다.

 

이와 같은 작업에 앞서 많은 사람들은 간단히 작업이 되는 줄 안다.

 

아무튼 도움이 되었길 바라면서....

 

반응형

댓글