본문 바로가기

Language/파이썬

Building a Job Scrapper(5)

 

 

 

#2.8 Extracting Locations and Finishing up

 

 

 

# 반복될 수 있는 부분을 함수로 만들어 둔다.

def extract_job(html):
    """Extract the job title and company name from one Indeed result card.

    html: a BeautifulSoup Tag for a single "jobsearch-SerpJobCard" div
    (each `result` from the search-results loop).
    Returns a dict with 'title' and 'company' keys.
    """
    # The anchor inside the title div carries the full job title attribute.
    title = html.find("div", {"class": "title"}).find("a")["title"]

    company = html.find("span", {"class": "company"})
    company_anchor = company.find("a")

    # Some cards wrap the company name in a link, others in plain text.
    # Fix: reuse the anchor we already looked up instead of calling
    # company.find("a") a second time.
    if company_anchor is not None:
        company = str(company_anchor.string)
    else:
        company = str(company.string)
    company = company.strip()

    return {'title': title, 'company': company}



def extract_indeed_jobs(last_page):
    """Fetch the first Indeed results page and return a list of job dicts.

    last_page: total page count (not yet used here — only page 0 is
    fetched; the full pagination loop appears in the later revision).
    """
    jobs = []

    # for page in range(last_page):   # pagination added in a later step
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")

    results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})

    for result in results:
        job = extract_job(result)  # each result is one card's HTML subtree
        print(job)
        jobs.append(job)  # bug fix: jobs were printed but never collected,
                          # so the function always returned an empty list

    return jobs

 

 

 

 

 

 

 

 

 

 

 

def extract_indeed_jobs(last_page):
    """Scrape every Indeed result page and return the collected job dicts.

    last_page: number of result pages to fetch, as reported by
    extract_indeed_pages().
    """
    jobs = []

    # Bug fix: the pagination loop was commented out, so last_page was
    # ignored and only the first page (start=0) was ever requested.
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        for result in results:
            jobs.append(extract_job(result))

    return jobs

 

 

main.py

 

from indeed import extract_indeed_pages, extract_indeed_jobs

# Highest page number shown in Indeed's pagination bar.
last_indeed_page = extract_indeed_pages()

# Fix: variable name was misspelled "indeed_jpbs" in both uses.
indeed_jobs = extract_indeed_jobs(last_indeed_page)

print(indeed_jobs)

 

 

 

 

 

#최종

 

import requests

from bs4 import BeautifulSoup

 

LIMIT = 50  # number of job postings requested per results page

URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"  # base search query for "python" jobs

 

def extract_indeed_pages():
    """Return the highest page number in Indeed's pagination bar.

    Fetches the first results page and parses the numeric pagination links.
    Returns 1 when no numeric links are present (single page of results);
    the original code raised IndexError in that case.
    """
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")

    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')

    # The last anchor is the "Next" arrow, not a page number — skip it.
    pages = [int(link.string) for link in links[:-1]]

    # Robustness fix: with a single results page there are no numeric
    # links and pages[-1] would raise IndexError.
    if not pages:
        return 1

    return pages[-1]

 

def extract_job(html):
    """Extract title, company, location, and link from one Indeed card.

    html: a BeautifulSoup Tag for a single "jobsearch-SerpJobCard" div.
    Returns a dict with 'title', 'company', 'location', and 'link' keys.
    """
    # The anchor inside the title div carries the full job title attribute.
    title = html.find("div", {"class": "title"}).find("a")["title"]

    company = html.find("span", {"class": "company"})
    company_anchor = company.find("a")

    # Some cards wrap the company name in a link, others in plain text.
    # Fix: reuse the anchor we already looked up instead of calling
    # company.find("a") a second time.
    if company_anchor is not None:
        company = str(company_anchor.string)
    else:
        company = str(company.string)
    company = company.strip()

    # The location div stores the city/state in a data attribute.
    location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]

    # Indeed's per-posting id, used to build a direct view-job URL.
    job_id = html["data-jk"]

    return {
        'title': title,
        'company': company,
        'location': location,
        "link": f"https://www.indeed.com/viewjob?jk={job_id}",
    }



def extract_indeed_jobs(last_page):
    """Scrape every Indeed result page and return the collected job dicts.

    last_page: number of result pages to fetch, as reported by
    extract_indeed_pages().
    """
    jobs = []

    for page in range(last_page):
        # Fix: progress message was misspelled ("Scrapping pge{page}").
        print(f"Scraping page {page}")
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")

        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})

        for result in results:
            jobs.append(extract_job(result))

    return jobs

 



 

 

 



'Language > 파이썬' 카테고리의 다른 글

Building a Job Scrapper(4)  (0) 2020.01.04
Building a Job Scrapper(3)  (0) 2020.01.04
Building a Job Scrapper(2)  (0) 2020.01.04
Building a Job Scrapper  (0) 2020.01.02
Data Analysis/데이터로 그래프그리기  (0) 2018.01.05