This project was built as part of a Residency Cohort with Women in Data. The project was developed for the City of Rancho Cordova. Its purpose is to provide insights that will help the city's Economic Department build a skilled workforce that spurs business competitiveness and economic growth.
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
from random import randint
from time import sleep
# Request headers sent with every page download: a browser-like User-Agent
# so the request is not presented with the default client identifier.
headers = {'user-agent': 'Mozilla/5.0'} #to mimic a browser
def get_data(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The request is sent with the module-level ``headers`` so that it carries
    a browser-like User-Agent.

    Returns:
        The parsed page, or ``None`` when the download fails (connection
        problem, timeout, or an HTTP error status). Callers must check for
        ``None`` before using the result.
    """
    try:
        # A timeout keeps a stalled connection from hanging the scrape.
        page = requests.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx answers into HTTPError instead of parsing an error page.
        page.raise_for_status()
    except requests.exceptions.HTTPError as http_error:
        print(http_error)
        return None
    except requests.exceptions.RequestException:
        # Covers connection failures, timeouts, malformed URLs, etc.
        print("url error")
        return None
    return soup(page.content, "html.parser")
def get_next_url(npages):
    """Build the Glassdoor results URL for page *npages* and make it current.

    The address is stored in the module-level ``url`` (the variable the
    scraping loop passes to ``get_data``) and is also returned, so callers
    may use either.

    Args:
        npages: Page number to request; converted with ``str``.

    Returns:
        The full URL of the requested results page.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the caller would keep fetching the same page.
    global url
    base = "https://www.glassdoor.com"
    url = base + "/Job/rancho-cordova-jobs-SRCH_IL.0,14_IC1147221.htm?includeNoSalaryJobs=true&p=" + str(npages)
    return url
def get_company_name(bsobj):
    """Append the company name of every job card on the page to ``company_name``.

    One entry is appended per matching anchor. When an anchor has no
    ``<span>``, the string "nan" is appended instead so this column keeps
    one row per job card.

    Args:
        bsobj: Parsed results page (object providing ``find_all``).
    """
    job_elements = bsobj.find_all("a", class_="job-search-key-l2wjgv e1n63ojh0 jobLink")
    for job_element in job_elements:
        cname = job_element.find("span")
        # find() yields None when nothing matches; guard before reading .text.
        if cname is not None:
            company_name.append(cname.text.strip())
        else:
            company_name.append("nan")
def get_job_title(bsobj):
    """Append the title of every job card on the page to ``job_title_name``.

    One entry is appended per matching anchor; "nan" is used when the
    anchor contains no ``<span>``.

    Args:
        bsobj: Parsed results page (object providing ``find_all``).
    """
    job_titles = bsobj.find_all("a", class_="jobLink job-search-key-1rd3saf eigr9kq1")
    for job_title in job_titles:
        jtitle = job_title.find("span")
        # find() signals "no match" with None, not an empty list.
        if jtitle is not None:
            job_title_name.append(jtitle.text.strip())
        else:
            job_title_name.append('nan')
def get_job_loc(bsobj):
    """Append the location of every job card on the page to ``job_loc_name``.

    One entry is appended per matching container; "nan" is used when the
    container has no location ``<span>``.

    Args:
        bsobj: Parsed results page (object providing ``find_all``).
    """
    job_locs = bsobj.find_all("div", class_="d-flex flex-wrap job-search-key-1m2z0go e1rrn5ka2")
    for job_loc in job_locs:
        jloc = job_loc.find("span", class_="css-1buaf54 pr-xxsm job-search-key-iii9i8 e1rrn5ka4")
        # find() signals "no match" with None, not an empty list.
        if jloc is not None:
            job_loc_name.append(jloc.text.strip())
        else:
            job_loc_name.append("nan")
def get_job_age(bsobj):
    """Append the posting age of every job card on the page to ``j_age``.

    The text is stored as it appears on the page, stripped of surrounding
    whitespace. "nan" is used when the container has no age element.

    Args:
        bsobj: Parsed results page (object providing ``find_all``).
    """
    job_ages = bsobj.find_all("div", class_="d-flex justify-content-between css-pa6dqi")
    for job_age in job_ages:
        jage = job_age.find("div", class_="d-flex align-items-end pl-std css-17n8uzw")
        # find() signals "no match" with None, not an empty list.
        if jage is not None:
            j_age.append(jage.text.strip())
        else:
            j_age.append("nan")
def get_job_sal(bsobj):
    """Append the salary text of every matching container to ``j_sal``.

    "nan" is used when a container has no salary ``<span>``.

    NOTE: this function appends to a module-level list named ``j_sal``;
    that list must exist before the function is called.

    Args:
        bsobj: Parsed results page (object providing ``find_all``).
    """
    job_sals = bsobj.find_all("div", class_="css-1buaf54 pr-xxsm")
    for job_sal in job_sals:
        jsal = job_sal.find("span", class_="job-search-key-1hbqxax e1wijj240")
        # find() signals "no match" with None, not an empty list.
        if jsal is not None:
            j_sal.append(jsal.text.strip())
        else:
            j_sal.append("nan")
def get_industry_name(bsobj):
    """Append the industry label of every matching container to ``industry_name``.

    One entry is appended per matching container; "nan" is used when the
    inner label element is missing, so the column keeps one row per
    container.

    Args:
        bsobj: Parsed results page (object providing ``find_all``).
    """
    industry_elements = bsobj.find_all("div", class_="css-17ituy5 epgue5a3")
    for industry in industry_elements:
        iname = industry.find("div", class_="css-13q36n epgue5a0")
        # find() yields None when nothing matches; guard before reading .text.
        if iname is not None:
            industry_name.append(iname.text.strip())
        else:
            industry_name.append("nan")
def get_total_pagenumber(bsobj):
    """Return the total number of result pages announced by the page footer.

    The footer text is expected to contain the word "of" followed by the
    page count; the number after the first "of" is returned.

    Args:
        bsobj: Parsed results page (object providing ``find``).

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: If the pagination element is missing or its text does
            not contain a page count in the expected place.
    """
    pagetext = bsobj.find("div", class_="cell middle d-none d-md-block py-sm")
    if pagetext is None:
        raise ValueError("pagination element not found on the page")
    label = pagetext.text
    _, separator, total = label.partition("of")
    if not separator:
        raise ValueError("unexpected pagination text: %r" % label)
    try:
        return int(total.strip())
    except ValueError:
        # Re-raise with the offending text so the failure is diagnosable.
        raise ValueError("unexpected pagination text: %r" % label) from None
# Module-level accumulators, filled in place by the get_* helpers.
# Each list becomes one column of the final table.
company_name = []
industry_name = []
job_title_name = []
job_loc_name = []
j_age = []
# Salary scraping is disabled: define `j_sal = []` here and call
# get_job_sal(soup_obj) inside the loop to enable it.

# First results page for the Rancho Cordova location id.
url = "https://www.glassdoor.com/Job/jobs.htm?locT=C&locId=1147221"
npages = 1
# Placeholder so the first page is always fetched; replaced with the real
# total once that page has been parsed.
numofpages = 1

# `<=` so the final page is scraped too, and so a single-page result set
# terminates instead of looping forever.
while npages <= numofpages:
    # Random 1-2 second pause between requests.
    sleep(randint(1, 2))
    soup_obj = get_data(url)
    if soup_obj is None:
        # get_data reports failure with None; keep what was collected so far.
        print("stopping: could not download page", npages)
        break
    get_company_name(soup_obj)
    get_job_title(soup_obj)
    get_job_loc(soup_obj)
    get_job_age(soup_obj)
    if npages == 1:
        numofpages = get_total_pagenumber(soup_obj)
    npages = npages + 1
    # Use the address if get_next_url hands one back; otherwise leave `url`
    # as it is.
    next_url = get_next_url(npages)
    if next_url:
        url = next_url

# All four lists must have the same length for the DataFrame to build.
dict_com = {'Company':company_name,'Title':job_title_name,'Location':job_loc_name,'Jobage':j_age}
df_company = pd.DataFrame(dict_com)
df_company.head(20)
Company | Title | Location | Jobage | |
---|---|---|---|---|
0 | FedEx Ground PH US | Warehouse Package Handler | Sacramento, CA | 7d |
1 | Behavior Frontiers | Entry Level Administrative Assistant | Sacramento, CA | 11d |
2 | Amazon Workforce Staffing | Package Sorter - Immediate Hire $3,000 Sign On... | Rancho Cordova, CA | 29d |
3 | Amazon Workforce Staffing | Amazon Package Sorter - $3,000 Sign On Bonus! | Sacramento, CA | 29d |
4 | Amazon Workforce Staffing | Warehouse Worker - Immediate Hire | Galt, CA | 21d |
5 | UPS | Part-time Air Ramp Loader/Unloader Hiring Event | Mather, CA | 4d |
6 | Amazon Workforce Staffing | Amazon Fulfillment Center Warehouse Associate | Rancho Cordova, CA | 30d+ |
7 | California Farm Bureau Federation | Videographer/Editor | Sacramento, CA | 30d+ |
8 | Amazon Workforce Staffing | Warehouse Worker - Urgent Hire $3,000 Sign On ... | Elk Grove, CA | 29d |
9 | Amazon Workforce Staffing | Warehouse Worker - Amazon Hiring Now! | Rancho Cordova, CA | 30d+ |
10 | Pacific Coast Supply | Branch Manager | Rancho Cordova, CA | 10d |
11 | Amazon Workforce Staffing | Picker Packer - Amazon Hiring Now! | Davis, CA | 30d+ |
12 | Amazon Workforce Staffing | Picker Packer - Urgent Hire | Rancho Cordova, CA | 30d+ |
13 | ArborWorks, Inc | Journeyman Tree Trimmer / Foreman | California | 30d+ |
14 | Federal Bureau of Investigation | Special Agent | Sacramento, CA | 30d+ |
15 | City of Sacramento | Chief of Police | Sacramento, CA | 9d |
16 | Residential Property Owner | On-Site Property Manager | Sacramento, CA | 9d |
17 | The Travelers Companies, Inc. | Claim Operations Specialist | Rancho Cordova, CA | 24h |
18 | The Sierra Medical Partnership | Surgery Scheduler | Folsom, CA | 15d |
19 | The Travelers Companies, Inc. | Senior Workers' Compensation Claim Representative | Rancho Cordova, CA | 24h |
# Notebook cell: show the (rows, columns) size of the scraped table.
df_company.shape
# Recorded output of the cell above from the original run:
(870, 4)
# Write the table to a CSV file at this relative path.
df_company.to_csv("glassdoor_door_job_listing_10_02_2021.csv")