import requests
from bs4 import BeautifulSoup
# First attempt: fetch the Indeed search results and pull page numbers out of
# the <span> tags inside the pagination anchors.
indeed_result = requests.get('https://www.indeed.com/jobs?q=python&limit=50')
indeed_soup = BeautifulSoup(indeed_result.text, "html.parser")
print(indeed_soup)

pagination = indeed_soup.find("div", {"class": "pagination"})
links = pagination.find_all('a')

# Collect each anchor's <span> text, then drop the trailing "Next" entry.
pages = [link.find("span").string for link in links]
pages = pages[0:-1]
print(pages)
# 마지막에 출력된 next를 빼고 숫자만 나오도록 수정
# Second attempt: skip the trailing "Next" link up front and convert each
# remaining anchor's text straight to an int.
links = pagination.find_all('a')
pages = [int(link.string) for link in links[:-1]]
print(pages)
# Same extraction again, this time keeping the largest page number.
pages = [int(link.string) for link in links[:-1]]
max_page = pages[-1]  # last entry is the highest page shown (e.g. 20)
# range() 함수: 넣은 수만큼의 숫자(0부터 n-1까지)를 차례로 돌려주는 시퀀스를 만들어 준다.
# range(max_page) yields 0 .. max_page-1; print each page index.
pages = [int(link.string) for link in links[:-1]]
max_page = pages[-1]  # highest page number in the pagination bar
for n in range(max_page):
    print(n)
# Turn each page index into the "start" offset Indeed expects (50 per page).
pages = [int(link.string) for link in links[:-1]]
max_page = pages[-1]
for n in range(max_page):
    print(f"start ={n*50}")
노마드 강의 #2.5 Requesting Each Page
#indeed.py 파일을 새로 작성
import requests
from bs4 import BeautifulSoup
INDEED_URL = "https://www.indeed.com/jobs?q=python&limit=50"


def extract_indeed_pages():
    """Fetch the Indeed search page and return the last page number
    shown in its pagination bar."""
    result = requests.get(INDEED_URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    # Every pagination anchor except the final "Next" link holds a page number.
    page_numbers = [int(link.string) for link in pagination.find_all('a')[:-1]]
    return page_numbers[-1]
# main.py는 다음처럼 수정한다.
from indeed import extract_indeed_pages

# Print the last pagination page number reported by indeed.py.
print(extract_indeed_pages())
#함수를 추가한다.
main.py
from indeed import extract_indeed_pages, extract_indeed_jobs

# Find the last results page, then process every page up to it.
extract_indeed_jobs(extract_indeed_pages())
indeed.py
import requests
from bs4 import BeautifulSoup
LIMIT = 50  # results per page requested from Indeed
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"


def extract_indeed_pages():
    """Return the last page number shown in the Indeed pagination bar."""
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    # Every anchor except the trailing "Next" link holds a page number.
    numbers = [int(anchor.string) for anchor in pagination.find_all('a')[:-1]]
    return numbers[-1]


def extract_indeed_jobs(last_page):
    """Print the &start offset that selects each results page."""
    for page in range(last_page):
        print(f"&start={page*LIMIT}")
requests로 각 페이지 요청하기
import requests
from bs4 import BeautifulSoup
LIMIT = 50  # results per page requested from Indeed
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"


def extract_indeed_pages():
    """Return the highest page number in Indeed's pagination bar."""
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    # Skip the trailing "Next" anchor; the rest are page numbers.
    numbers = [int(anchor.string) for anchor in pagination.find_all('a')[:-1]]
    return numbers[-1]


def extract_indeed_jobs(last_page):
    """Request every results page and report each HTTP status code."""
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page*LIMIT}")
        print(result.status_code)
main.py 수정
from indeed import extract_indeed_pages, extract_indeed_jobs

# Find the last pagination page, then collect the job list for every page.
last_indeed_page = extract_indeed_pages()
indeed_jobs = extract_indeed_jobs(last_indeed_page)  # fixed misspelled "indeed_jpbs"
indeed.py 수정
def extract_indeed_jobs(last_page):
    """Request every results page, reporting each HTTP status code,
    and return the (currently still empty) list of jobs."""
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page*LIMIT}")
        print(result.status_code)
    return jobs
<출력사항 없음>
어떻게 데이터를 html에서 추출해낼 수 있을까
'Language > 파이썬' 카테고리의 다른 글
Building a Job Scrapper(4) (0) | 2020.01.04 |
---|---|
Building a Job Scrapper(3) (0) | 2020.01.04 |
Building a Job Scrapper (0) | 2020.01.02 |
Data Analysis/데이터로 그래프그리기 (0) | 2018.01.05 |
Data Analysis/웹페이지 데이터가져오기 (0) | 2018.01.05 |