Share |

Friday, June 24, 2016

How to scrape data from a website using Selenium in Python

Scootsy Crawler / Scraper

WARNING - Contents of this page are for educational purposes only. It is strongly suggested that you do not use this knowledge for illegal purposes!

Below is the code used to scrape, or extract, information from a website.
  • Libraries used in the code: sys, Selenium, pandas, and datetime.
  • Web driver: Firefox. You can use the Chrome web driver instead.

python code view

Python-IDLE-Scootsy-crawler-code

Scraped Data view

Scootsy scraped data

Scootsy Scraper Code

click here to view project online

import sys
import selenium
import selenium.webdriver.common.keys
import pandas
import datetime

URL = 'https://scootsy.com/get_vendor_listing.php'
#*************************************************************************************************
maxcharLen = 60
print (maxcharLen + 1) * '-'
strToPrint = 'Scootsy Crawler 1.0'
print (maxcharLen  - len(strToPrint))/2 * '*', strToPrint , (maxcharLen  - len(strToPrint))/2 * '*'
strToPrint = "Please don't change the code "
print (maxcharLen  - len(strToPrint))/2 * '*' , strToPrint , (maxcharLen  - len(strToPrint))/2 * '*'
strToPrint = "If you change it will leads to raise an error"
print (maxcharLen  - len(strToPrint))/2 * '*' , strToPrint , (maxcharLen  - len(strToPrint))/2 * '*'
print (maxcharLen + 1) * '-'
#*************************************************************************************************

sys.stdout.write("\r" + "Driver Initializing ...")
#driver = selenium.webdriver.Chrome("C:\Program Files (x86)\Google\Chrome\Application\chrome.exe")
driver = selenium.webdriver.Firefox()
sys.stdout.write("\r" + "Navigating to url : " + URL )
driver.get(URL)
#assert "tabs-container2" in driver.find_element_by_id('tabs-container2')

ul = driver.find_element_by_tag_name("ul")
sys.stdout.write("\r" + "start crawling for Restaurant : " + URL )
columns = ['Id','City','Restaurant Name','Cuisines','Delivery Time','URL']

restaurants = [[]]
i = 0

print "\r" + "found available restaurant : ", len(ul.find_elements_by_tag_name("li"))
for li in ul.find_elements_by_tag_name("li"):

    div = li.find_element_by_class_name('dish_name')
    a = div.find_element_by_tag_name('a')
    span = div.find_element_by_tag_name('span')

    resid = li.get_attribute('id')
    resName = a.text
    cousin = span.text
    deliverytime = li.find_element_by_class_name('icn').find_element_by_tag_name('span').text
    link = a.get_attribute('href')
    city = link.split('/',3)[3].split('-')[3]

    restaurants[i].append(resid)
    restaurants[i].append(city)
    restaurants[i].append(resName)
    restaurants[i].append(cousin)
    restaurants[i].append(deliverytime)
    restaurants[i].append(link)

    sys.stdout.write("\r" + "Data Extracted For Restaurant : " + resName)

    restaurants.append([])

    i += 1

sys.stdout.write("\r" + "Data Extraction Finished...")
driver.close()
sys.stdout.write("\r" + "Driver Close...")
data_Table = pandas.DataFrame.from_records(restaurants,columns=columns)

filename = "scootsy crawl " + str(datetime.datetime.today().strftime("%d%m%y %H%M%S")) + ".xlsx"
sys.stdout.write("\r" + "File Saving TO : " + filename)
data_Table.to_excel(filename,'Sccotsy',index=False,encoding="UTF-8")
print "\r" + "File Saved @ : ", filename
sys.stdout.read()

4 comments:

© 2016 All Rights Reserved | DMCA.com Protection Status