Extract all link URLs from an HTML document

from bs4 import BeautifulSoup


def extract_urls(path):
    """Return the ``href`` value of every link in the HTML file at *path*.

    Args:
        path: Filesystem path of the HTML document to read.

    Returns:
        A list of ``href`` strings, in document order, one for each
        ``<a>`` tag that has an ``href`` attribute. Anchors without an
        ``href`` are skipped. An empty list is returned when the file
        cannot be opened or read.
    """
    try:
        # Read raw bytes so Beautiful Soup detects the document encoding
        # itself instead of relying on the platform's default text encoding.
        with open(path, 'rb') as html_file:
            markup = html_file.read()
    except IOError:
        # Best effort: an unreadable file simply has no links to report.
        return []

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed alongside bs4.
    soup = BeautifulSoup(markup, 'html.parser')

    # href=True matches only anchors that carry an href attribute, so the
    # result never contains None placeholders.
    return [link['href'] for link in soup.find_all('a', href=True)]

Usage

# Example: given a file called 'example.html' containing:
#
# <html>
#     <body>
#         <a href="http://example.com/1">1</a>
#         <a href="http://example.com/2">2</a>
#         <a href="http://example.com/3">3</a>
#     </body>
# </html>

# Collect the href of each <a> tag found in the file.
result = extract_urls('example.html')

print(result)

# prints the list below (print() emits it on a single line; it is
# wrapped here for readability):
#
# [
#     'http://example.com/1',
#     'http://example.com/2',
#     'http://example.com/3'
# ]

About this Entry

Related Content


Comments