Scraping Websites
Tue Nov 13, 2018 · 1083 words

The following code scrapes a list of websites using Python. Most of the explanations are provided within the code as comments.

We first import a few packages to assist with scraping, and set the working directory.

#Import Packages
import requests
from bs4 import BeautifulSoup
import lxml
import os
import re
import pandas as pd
import numpy as np
from urllib.parse import urljoin

#Set Working Directory
os.getcwd()
os.chdir("Your/File/Path/Here")

Once the environment was set up, I wrote a variety of functions to conduct different types of scrapes. Doing it this way lets me build more flexible scrapes in the future and keeps me from repeating the same commands over and over.

#Define functions to scrape website

def scrape_prep(link, home):
    """This function prepares a URL for further processing.  You provide two arguments:
        link=the link you want to scrape and prepare
        home=for use with sub-menus.  If the link you provide is a menu link (so, '/Services' instead of a full site),
                put the URL where the menu originates here ('www.awebsite.com').
                If link is a regular site, place an empty string here."""
    #Prepare the url
    olink=link
    nhome=home
    
    match = re.search(r"http", home)
    if match is None:
        nhome = "http://" + home

    www = re.search(r"www\.", link)
    if www is None:
        olink = urljoin(nhome, link)

    match = re.search(r"http", olink)
    if match is None:
        olink = "http://" + olink
    #print(olink) #turn this on to troubleshoot
    #Import the text
    try:
        r = requests.get(olink)
    except requests.exceptions.RequestException:
        print(olink, "Error!")
        return ""
        
    html_doc = r.text
    
    #Turn it into soup
    soup = BeautifulSoup(html_doc, 'lxml')
    
    #Return the edited document
    return soup

def scrape_main(soup):
    """This returns all of the text in the provided soup, without cleaning or editing.  Will include menu items."""  
    alltxt = soup.get_text()
    return alltxt

def scrape_menu(soup, types):
    """This returns all menu items.  Depending on the website, it may include sub-menu items.
    soup=soup of website you want
    types= what type of return you want:
        'link' returns menu links
        'text' returns the menu names
        'dict' returns a dictionary where the menu names are keys and the URLs are values
    Can be told to grab the text, links, or a dictionary that contains both."""
    #Initialize lists
    menu = []
    menlk = []
    #Loop over the items and fill lists with names and URLs.
    for tag in soup.find_all("li"):
        for a in tag.find_all("a"):
            #Get the text of each menu entry
            for name in a.text.split('\n'):
                if len(name) > 0:
                    menu.append(name.strip())
            #Get the link attached to this menu entry (empty string if it has none)
            link = a.get("href")
            if link is not None:
                menlk.append(link)
            else:
                menlk.append("")
    
    #Return the appropriate information to user, based on string "types"
    if types=="link":
        return menlk
    if types=="text":
        return menu
    if types=="dict":
        key=menu
        values=menlk
        dictionary = dict(zip(key, values))
        return dictionary


    
def skinny_scrape(soup):
    """This scrapes all tagged 'paragraphs' from the website.  May miss some information.
    Provide soup (which can be generated by scrape_prep)""" 
    parags=[]
    for tag in soup.find_all("p"):
        for name in tag.text.split('\n'):
            if len(name) > 0:
                parags.append(name.strip())
    return parags


def scrape_links(soup):
    """Give this function the soup and it will return all links from the site as a list"""
    links=[]
    for lk in soup.find_all("a"):
        link = lk.get("href")
        links.append(link)
    return links
        
    

def scrape_select(dic, term, types, home):
    """This function pulls from the menu, opening the link associated with a term and getting requested contents:
        menu links and names, paragraph content
        It accepts four arguments:
        dic=dictionary which should include tab names and links (note that scrape_menu can provide this)
        term=the term you want to find in the menu
        types=the type of return you want, which accepts:
            'tabname' (the name of the tab that contained the search term)
            'parags' (the plain text from the matching page)
            'link', 'text', or 'dict' (passed straight through to scrape_menu for the matching page)
        home=the home URL for the site (as string)
            """
    soup=""
    tabname=""
    for each in dic:
        match = re.search(term, each)
        if match!=None:
            if types=="tabname":
                tabname=each
                return tabname
            else:
                soup = scrape_prep(dic[each], home)
    if soup=="":
        #print("No matches")
        return ""
    if types=="parags":
        para=skinny_scrape(soup)
        return para
    else:
        lks=scrape_menu(soup, types)
        return lks
                   

def scrape_find(dic: dict, term: str, types: str):
    """This function identifies all matching cases in provided menu, and returns a list.
        Arguments include:
            -dic (your dictionary)
            -term (what you want to search for)
            -types (what type of output you want)
                -types can be either 'text' or 'link'"""
    matches=[]
    for each in dic:
        match=re.search(term, each)
        if match!=None:
            if types=="text":
                matches.append(each)
            if types=="link":
                matches.append(dic[each])
                
    return matches
    

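With the functions defined, it helps to try them on a single site before looping over the full list. Below is a minimal sketch of how the pieces fit together; the URL "www.example.com" and the search term "Mission" are placeholders rather than sites or terms from my dataset.

#Quick test of the functions on one placeholder site
testsoup = scrape_prep("www.example.com", "")
if testsoup != "":
    testmenu = scrape_menu(testsoup, "dict")  #menu names mapped to their links
    testtext = skinny_scrape(testsoup)        #paragraph text from the home page
    testtab = scrape_select(testmenu, "Mission", "tabname", "www.example.com")
    print(testtab)       #name of the matching menu tab, or "" if there is no match
    print(testtext[:5])  #first few scraped paragraphs
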
Once everything is ready to go, we need to actually read in the data. My dataframe has a variety of hospital characteristics, but for the purposes of this scrape, I only use the “url” column. I read it in, and then initialize the variables that I will populate when I scrape the sites.

#Import Dataset
##CSV file with URLs and other hospital data, read in as a pandas dataframe
georgia = pd.read_csv("appended2.csv")

#Create a Series of just the URLs (in my dataset, the URLs are under a column named "url").
urls = georgia.url


#Create new variables to populate (used in next section)
georgia['soup']=np.nan
georgia['menu']= np.nan
georgia['hometext']=np.nan
georgia['bartext']=np.nan
georgia['bartab']=np.nan
georgia['davinci']=np.nan
georgia['datab']=np.nan
georgia['misstab']=np.nan
georgia['misstext']=np.nan

Finally, I use a for loop to scrape every site in the list.

#Scrape Content from all URLs
##This iterates through any list of URLs--I compiled my list through www.ahd.com using a free education account.
###If you want to publish, AHD may provide you free access to their data, even without the EDU account.
i=0 #initialize count
for url in urls:
    #print(url) #Turn on to troubleshoot
    if pd.notna(url):
        #Prep the url for scraping
        urlsoup=scrape_prep(url, "") ##will return a blank string if the URL is broken and print "Error!"
        if urlsoup != "":
            #Get the menu
            menu = scrape_menu(urlsoup, "dict")
            mentxt = [*menu] #changes menu into a list of keys
            georgia.loc[i, "menu"] = "; ".join(mentxt)

            #Get the text from home page
            text = skinny_scrape(urlsoup)
            georgia.loc[i, "hometext"] = "; ".join(text)

            #Get the info for bariatric surgery (if it exists)
            barterms = ["Weight Loss", "Weight-Loss", "bariatric", "weightloss", "weight loss", "weight-loss"]
            #Get tab names
            bartab=""
            for term in barterms:
                if bartab=="":
                    bartab = scrape_select(menu, term, "tabname", url)
                    if bartab != "":
                        bartext= scrape_select(menu, term, "parags", url)

            if bartab!="":
                georgia.loc[i, "bartab"] = bartab
                #Get text information
                georgia.loc[i, "bartext"] = "; ".join(bartext)

            #Get the Da Vinci info
            daterms = ['Da Vinci', "Robotic Surgery", "Robotic"]
            datab=""
            for term in daterms:
                if datab=="":
                    datab=scrape_select(menu, term, "tabname", url)
                    if datab!="":
                        datext=scrape_select(menu, term, "parags", url)

            if datab!="":
                georgia.loc[i, "datab"] = datab
                georgia.loc[i, "davinci"] = "; ".join(datext)

            #Get the mission statements
            missterms = ['Mission','Purpose']
            termtab=""
            for term in missterms:
                if termtab=="":
                    termtab=scrape_select(menu, term, "tabname", url)
                    if termtab!="":
                        misstext=scrape_select(menu, term, "parags", url)
            if termtab!="":
                georgia.loc[i, "misstab"] = termtab
                georgia.loc[i, "misstext"] = "; ".join(misstext)

        
    i=i+1

Once the scraping is done, I check the results and then send the dataset to a CSV for analysis in R!

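One simple way to check is to count how many sites actually produced text for each new field and spot-check a few rows; this is just one possible sanity check, using the column names created above.

#Quick check: how many sites returned something for each scraped field
print(georgia[['menu', 'hometext', 'bartext', 'davinci', 'misstext']].notna().sum())

#Spot-check a few rows before exporting
print(georgia[['url', 'bartab', 'datab', 'misstab']].head())
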
#Export newly created dataset to CSV
georgia.to_csv("georgiatxt2.csv")
