본문 바로가기

Language/파이썬

Building a Job Scrapper(3)

 

노마드코더님 감사합니다.^^

#2.6 Extracting Titles  https://academy.nomadcoders.co/courses/681401/lectures/12171971

 

 

 

 

 

import requests

from bs4 import BeautifulSoup

# Number of job postings requested per results page.
LIMIT = 50

# Indeed search URL: jobs matching "python", LIMIT results per page.
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

 

def extract_indeed_pages():
    """Return the highest page number shown in Indeed's pagination bar."""
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    anchors = pagination.find_all('a')
    # The last anchor is the "Next" arrow, not a page number — skip it.
    pages = [int(anchor.string) for anchor in anchors[:-1]]
    return pages[-1]



def extract_indeed_jobs(last_page):
    """Request every results page up to last_page and print its job cards."""
    jobs = []
    for page in range(last_page):
        # Each page is offset by page * LIMIT postings.
        response = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        print(cards)
    return jobs

 

 

 

result = soup.find_all( "div" , { "class" :  "jobsearch-SerpJobCard" }) 실행

<참고>

실행하면 매 20페이지씩 출력되므로 엄청나다. 웬만하면 실행하지 않는 것이.................ㅎㅎ

 

 

 

 

 

#그래서 한장만 출력하도록 수정..........

 

import requests

from bs4 import BeautifulSoup

# Number of job postings requested per results page.
LIMIT = 50

# Indeed search URL: jobs matching "python", LIMIT results per page.
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

 

def extract_indeed_pages():
    """Scrape the pagination bar and return the last page number."""
    soup = BeautifulSoup(requests.get(URL).text, "html.parser")
    links = soup.find("div", {"class": "pagination"}).find_all('a')
    page_numbers = []
    for link in links[:-1]:  # the trailing link is the "Next" arrow
        page_numbers.append(int(link.string))
    return page_numbers[-1]



def extract_indeed_jobs(last_page):
    """Request only the first results page (loop disabled) and print its job cards."""
    jobs = []
    # Iterating every page floods the console, so only page 0 is fetched here.
    response = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
    print(cards)
    return jobs

 

<참고: 1장분만 출력돼 나온다^^>

 

 

 

 

 

 

#result는 soup의 리스트이다. title부분을 출력해보면.....

 

def extract_indeed_jobs(last_page):
    """Fetch the first results page and print the title <div> list of each card."""
    jobs = []
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    for card in soup.find_all("div", {"class": "jobsearch-SerpJobCard"}):
        # find_all returns a ResultSet holding the card's title <div>(s).
        print(card.find_all("div", {"class": "title"}))
    return jobs

 

 

    print (result.find_all( "div" ,{ "class" : "title" })) 출력

<참고: 타이틀이 들어간 부분만 출력된다.>

 

 

 

 

 

 

#result는 soup의 리스트이다. title로 바꾸고 앵커 a를  출력해보면.....

 

def extract_indeed_jobs(last_page):
    # NOTE(review): intentional error demonstration from the post —
    # find_all() returns a ResultSet (a list-like), which has no .find()
    # method, so title.find("a") below raises AttributeError. The next
    # version of this function switches to result.find(...), which
    # returns a single Tag.
    jobs = []
    #for page in range(last_page):
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
    for result in results:
        title = result.find_all("div",{"class":"title"})
        print(title.find("a"))
    return jobs

 

title = result.find_all( "div" ,{ "class" : "title" })

 

 

# title = result.find("div",{"class":"title"})로 수정해서 다시 출력. 왜냐면 다음 작업에서 에러가^^

def extract_indeed_jobs(last_page):
    """Fetch the first results page and print the <a> inside each card's title div."""
    jobs = []
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
    for card in cards:
        # find() returns a single Tag (not a ResultSet), so chaining works.
        title_div = card.find("div", {"class": "title"})
        print(title_div.find("a"))
    return jobs

 

 

 

 

 

# title을 넣어서.  soup.find_all로 작업하면 에러출력하니 조심ㅠㅠ

 

def extract_indeed_jobs(last_page):
    """Print the 'title' attribute of each job card's anchor tag."""
    jobs = []
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    for card in soup.find_all("div", {"class": "jobsearch-SerpJobCard"}):
        # Must use find() here, not find_all(): a ResultSet has no .find().
        title_div = card.find("div", {"class": "title"})
        # The readable job title lives in the anchor's "title" attribute.
        anchor = title_div.find("a")["title"]
        print(anchor)
    return jobs

 

anchor = title.find( "a" )[ "title" ]

 

 

#코드를 한줄로 줄여보자

def extract_indeed_jobs(last_page):
    # NOTE(review): intentional error demonstration from the post — the
    # stray ".title" in the chained expression below is BeautifulSoup's
    # tag-name shortcut (looks for a <title> element inside the div,
    # which is usually None), so the subsequent .find("a") raises
    # AttributeError. The corrected one-liner follows in the next version.
    jobs = []
    #for page in range(last_page):
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
    for result in results:
        title = result.find("div",{"class":"title"}).title.find("a")["title"]  # doing it like this blows up with an error ^^
        print(title)
    return jobs



 

# 다시

def extract_indeed_jobs(last_page):
    """Fetch the first results page and print each job's title text (final version)."""
    jobs = []
    response = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
    for card in cards:
        # Single chained expression: title div -> anchor -> "title" attribute.
        print(card.find("div", {"class": "title"}).find("a")["title"])
    return jobs

 

<출력: ok>

'Language > 파이썬' 카테고리의 다른 글

Building a Job Scrapper(5)  (0) 2020.01.04
Building a Job Scrapper(4)  (0) 2020.01.04
Building a Job Scrapper(2)  (0) 2020.01.04
Building a Job Scrapper  (0) 2020.01.02
Data Analysis/데이터로 그래프그리기  (0) 2018.01.05