This notebook is used to perform a search in the WHO Photo Library, and then iterate through and save the detail pages to the "html" folder and images to the "img" folder.
This notebook uses selenium and Firefox's Gecko driver. Selenium can be installed with a simple
pip install selenium
but geckodriver must be downloaded from Mozilla's GitHub and added to your machine's PATH.
With selenium set up properly, everything should run correctly, as long as you run the cells one at a time, and make sure that when the browser is supposed to be doing something, it has time to do it. (No running all cells at once, that probably won't work.)
Everything up to "Define some functions" is about navigating to the search page and running the search consistently. Once those functions are defined and the browser is looking at a results page, the cell below "Start iterating" will do the heavy lifting and run the scrape. It might error out occasionally; just back up one result in the browser, then start it up again.
After the scrape, there are a few cells to parse out the data and save it as a CSV. The scrape is saving the HTML of the details page, as well as the large image file, but the data needs to be parsed into a tabular format. You can run all of those cells at once with a Cell --> Run all below with the "Check results" cell active. It will output a csv called "who_image_metadata.csv" with all of the metadata from the details page included.
The actual HTML and image files downloaded will be in the "html" and "img" folders, respectively, and use identifiers from the WHO in their filenames.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException,StaleElementReferenceException
from random import randint
import re
import time
from subprocess import call
import os
from lxml import html
import pandas as pd
# Launch Firefox (requires geckodriver on PATH) and open the WHO Photo Library.
driver = webdriver.Firefox()
driver.get("https://extranet.who.int/photolibrary/")

# Enter the free-access section of the library.
element = driver.find_element_by_xpath('//a[@id="freeaccess"]')
element.click()

# Run a full-text search for "disability".
searchbox = driver.find_element_by_xpath('//input[@id="idSearchFullTextQuery"]')
searchbox.clear()
searchbox.send_keys("disability")
search_button = driver.find_element_by_xpath('//a[@class="menuSearchBoxOk"]')
search_button.click()

# Open the first result's detail page and save its HTML, keyed by the WHO image id
# scraped from the page itself.
first_detail = driver.find_element_by_css_selector('.thumbIconLink img')
first_detail.click()
image_id = driver.find_element_by_css_selector('#idFicheImgTd font').text
# Explicit utf-8: page_source is a str that may contain non-ASCII characters,
# and the platform default encoding could raise UnicodeEncodeError on write.
with open("html/{}_detail.html".format(image_id), 'w', encoding='utf-8') as fp:
    fp.write(driver.page_source)
def switch_view():
    """Toggle between the detail view and the zoom (image) view.

    Returns the ``name`` attribute of the toggle link, i.e. the view
    being switched *to* ("zoom" means we are moving to the image page).
    """
    toggle_link = driver.find_element_by_xpath('//table[@id="idtblnavigtop"]/tbody/tr[1]/td[3]/a')
    target_view = toggle_link.get_attribute('name')
    toggle_link.find_element_by_xpath('img').click()
    return target_view
def save_detail(image_id=None):
    """Save the current detail page's HTML to ``html/<image_id>_detail.html``.

    If *image_id* is not supplied, it is scraped from the page itself.
    Returns the image_id used in the filename.
    """
    if image_id is None:
        image_id = driver.find_element_by_css_selector('#idFicheImgTd font').text
    # utf-8 so non-ASCII text in the page cannot raise UnicodeEncodeError.
    with open("html/{}_detail.html".format(image_id), 'w', encoding='utf-8') as fp:
        fp.write(driver.page_source)
    return image_id
def save_image(image_id=None):
    """Download the large image on the current zoom page via wget.

    If *image_id* is not supplied, it is scraped from the first line of the
    zoom caption. Returns the image_id used in the filename.
    """
    if image_id is None:
        image_id = driver.find_element_by_css_selector('.zoomText').text.split('\n')[0]
    src = driver.find_element_by_id('idFicheImg').get_attribute('src')
    # wget is invoked with an argument list (no shell), so the URL is not
    # subject to shell interpretation.
    call(['wget', '-O', 'img/{}_large.jpg'.format(image_id), src])
    return image_id
def click_next():
    """Advance to the next search result, if there is one.

    Returns the ``name`` of the view toggle link after the move (truthy,
    so the scrape loop keeps running), or None when the "next" cell is
    empty, i.e. we are on the last result.
    """
    next_cell = driver.find_element_by_xpath('//table[@id="idtblnavigtop"]/tbody/tr[1]/td[11]')
    if not next_cell.find_elements_by_css_selector("*"):
        return None
    next_cell.click()

    toggle_xpath = '//table[@id="idtblnavigtop"]/tbody/tr[1]/td[3]/a'
    toggle_link = driver.find_element_by_xpath(toggle_xpath)
    try:
        return toggle_link.get_attribute('name')
    except StaleElementReferenceException:
        # The page refreshed under us; re-locate the link and retry once.
        return driver.find_element_by_xpath(toggle_xpath).get_attribute('name')
def get_page():
    """Save both views of the current result, then advance to the next one.

    The toggle link's ``name`` tells us which view we would switch TO:
    "zoom" means the browser is currently on a detail page. Save the
    current page, flip the view, save the counterpart, then move on.
    Returns click_next()'s result (None after the last result).
    """
    toggle = driver.find_element_by_xpath('//table[@id="idtblnavigtop"]/tbody/tr[1]/td[3]/a')
    if toggle.get_attribute('name') == "zoom":
        # The link points at the zoom view, so this is a detail page.
        save_detail()
    else:
        # Otherwise we are already looking at the image page.
        save_image()
    # switch_view() returns the name of the view we just moved to, so
    # "zoom" here means the browser is now on the image page.
    next_view = switch_view()
    time.sleep(2)  # give the new view time to load
    if next_view == "zoom":
        save_image()
    else:
        save_detail()
    return click_next()
Scraping pattern:
# Walk through every result: get_page() saves both views of the current
# result and advances; it returns None after the last one.
while True:
    still_going = get_page()
    time.sleep(1)
    if not still_going:
        break
# Sanity check (notebook cell outputs): how many detail pages were saved?
len(os.listdir('html'))
# ...and how many images? The two counts should roughly match.
len(os.listdir('img'))
# Collect the saved detail pages. Filter by the suffix the scraper used
# rather than blindly dropping the first entry: os.listdir order is
# arbitrary, so pop(0) is not guaranteed to remove the stray hidden file
# (e.g. .DS_Store / .ipynb_checkpoints) it presumably targeted, and could
# instead silently discard a real detail page.
filelist = sorted(f for f in os.listdir("html") if f.endswith("_detail.html"))
# Spot-check the parse logic on one saved detail page before running it
# over everything.
doc = html.parse('html/WHO_004228_detail.html')
# Field labels live in td.list2 cells; drop the leading cell, which has
# no matching value in the td.list2b column (presumably a header artifact
# — verify against the saved HTML).
keys = [cell.text for cell in doc.xpath('//td[@class="list2"]')]
keys.pop(0)
vals = [cell.text_content() for cell in doc.xpath('//td[@class="list2b"]')]
# Preview the pairing, then the dict form (notebook cell outputs).
list(zip(keys, vals))
dict(zip(keys, vals))
# Parse every saved detail page into a flat metadata dict and write a CSV.
metadata = []
for f in filelist:
    the_id = f.replace('_detail.html', '')
    doc = html.parse(os.path.join('html', f))
    keys = [e.text for e in doc.xpath('//td[@class="list2"]')]
    keys.pop(0)  # drop the leading label cell that has no matching value
    vals = [e.text_content() for e in doc.xpath('//td[@class="list2b"]')]
    meta = dict(zip(keys, vals))
    meta['id'] = the_id
    metadata.append(meta)

df = pd.DataFrame(metadata)
# BUG FIX: the scraper saves pages as "<id>_detail.html" (singular); the
# original wrote "_details.html" here, producing paths that do not point
# at the files actually on disk.
df['html'] = "html/" + df['id'] + "_detail.html"
df['img'] = "img/" + df['id'] + "_large.jpg"
df.to_csv('who_image_metadata.csv')