My Tech Life

Memo by a Japanese Software Developer in his late 50s.

An example of web scraping with Python driving a real browser.

Previously, I posted a web scraping example using urllib,

but it becomes difficult to handle

when there is a session involved.

 

Using Selenium to utilize a browser

makes it easier to handle sessions.

 

In the following sample:

You set URL, XPATH, and search string in the xpath_list.

 

Loop through xpath_list,

get HTML through the browser,

and convert the returned HTML to text using BS4.

 

A common way to obtain XPATH for setting in the list is

to use the browser's built-in developer tools,

specify the desired position,

and copy the XPATH from the copy menu.

 

#!/usr/bin/env python3
# -*- coding: utf8 -*-

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import re
from bs4 import BeautifulSoup

# Each entry: [URL, XPath of the search input, query string]
xpath_list = [
        ['https://www.yahoo.co.jp', '/html/body/div/div/header/section[1]/div/form/fieldset/span/input', 'Selenium'],
        ['https://dev.to', '/html/body/header/div/div[1]/form/div/div/input', 'Selenium']
]

for url, xpath, query in xpath_list:
    # Fresh browser per site keeps sessions isolated between targets.
    driver = webdriver.Firefox()
    try:
        driver.get(url)

        # Locate the search box and submit the query.
        search_box = driver.find_element(by=By.XPATH, value=xpath)
        search_box.send_keys(query)
        search_box.send_keys(Keys.ENTER)

        # Wait for the result page to load BEFORE grabbing the HTML.
        # (The original captured page_source first and slept afterwards,
        # so the search results could be missing from the capture.)
        time.sleep(1)
        html = driver.page_source
    finally:
        # Always release the browser, even if navigation or lookup fails;
        # the original leaked the driver on any exception.
        driver.quit()

    # Strip markup with BS4 and print only the non-empty text lines.
    soup = BeautifulSoup(html, 'html.parser')
    for text_line in soup.get_text().split('\n'):
        if not re.search(r'^\s*$', text_line):
            print(text_line)

    time.sleep(1)

 

 

ブラウザとPythonを連携させてWebスクレイピング例

以前、urllibを使ったWebスクレイピング例を投稿したが、

セッションがあると、扱うのが難しくなってくる。

 

Seleniumを使って、ブラウザを利用すると、

セッションが扱いやすくなる。

 

以下サンプルでは、

xpath_listに、URL、XPATH、検索文字列、を設定する。

xpath_listでループして、ブラウザを通して、HTMLを取得する。

BS4を使って、戻ってきたHTMLをテキストに変換している。

 

リストに設定するXPATHを取得する一般的な方法は、

ブラウザ付属のウェブ開発ツールで、

入力したい位置を指定し、コピーメニューで、XPATHをコピーする。

 

#!/usr/bin/env python3
# -*- coding: utf8 -*-

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import re
from bs4 import BeautifulSoup

# Each entry: [URL, XPath of the search input, query string]
xpath_list = [
        ['https://www.yahoo.co.jp', '/html/body/div/div/header/section[1]/div/form/fieldset/span/input', 'Selenium'],
        ['https://dev.to', '/html/body/header/div/div[1]/form/div/div/input', 'Selenium']
]

for url, xpath, query in xpath_list:
    # Fresh browser per site keeps sessions isolated between targets.
    driver = webdriver.Firefox()
    try:
        driver.get(url)

        # Locate the search box and submit the query.
        search_box = driver.find_element(by=By.XPATH, value=xpath)
        search_box.send_keys(query)
        search_box.send_keys(Keys.ENTER)

        # Wait for the result page to load BEFORE grabbing the HTML.
        # (The original captured page_source first and slept afterwards,
        # so the search results could be missing from the capture.)
        time.sleep(1)
        html = driver.page_source
    finally:
        # Always release the browser, even if navigation or lookup fails;
        # the original leaked the driver on any exception.
        driver.quit()

    # Strip markup with BS4 and print only the non-empty text lines.
    soup = BeautifulSoup(html, 'html.parser')
    for text_line in soup.get_text().split('\n'):
        if not re.search(r'^\s*$', text_line):
            print(text_line)

    time.sleep(1)

 

 

Fetching emails in bulk using Python (further improved version).

I've modularized the previous version of the email fetching script,

focusing solely on message retrieval.

I omitted the decoding and content processing,

leaving those tasks to be handled in the main processing script if needed.

 

I've also added documentation for the module.

 

Save the following code as "mbox_reader.py" and use it:

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

"""
Module: mbox_reader.py

This module provides a class for reading messages from an mbox file.

Usage:
    from mbox_reader import MboxReader

    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for i, obj in enumerate(mbox):
            # Your code here

Classes:
    MboxReader(filename)
        A class for reading messages from an mbox file.

"""

import email
from email.policy import default
from email.header import decode_header

class MboxReader:
    """Iterate over the messages of an mbox file.

    Each iteration yields a message parsed by email.message_from_bytes
    with the modern ``default`` policy.  Use as a context manager so the
    underlying file handle is closed deterministically.
    """

    def __init__(self, filename):
        # Open in binary mode: mbox files may mix encodings across messages.
        self.handle = open(filename, 'rb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next message; raise StopIteration at end of file.

        Lines are buffered until the next ``From `` separator line, then
        parsed into a message.  The separator at the very start of the
        file produces a headerless (falsy) message, which is skipped.
        """
        lines = []
        while True:
            line = self.handle.readline()
            if not line:
                # EOF: flush the final buffered message.  The original
                # raised StopIteration unconditionally here, silently
                # dropping the last message of the file.
                if lines:
                    message = email.message_from_bytes(b'\n'.join(lines), policy=default)
                    if message:
                        return message
                raise StopIteration
            if line.startswith(b'From '):
                message = email.message_from_bytes(b'\n'.join(lines), policy=default)
                if message:
                    return message
                lines = []
            else:
                lines.append(line.rstrip(b'\r\n'))

if __name__ == "__main__":
    # Demo: print date and subject for the first 101 messages.
    with MboxReader('category_social.mbox') as mbox:
        for index, message in enumerate(mbox):
            if index > 100:
                break
            print(index, message.get('Date'), message.get('Subject'))

 

 

It can be called as follows:

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from mbox_reader import MboxReader

# Print date and subject for the first 101 messages in the mbox file.
with MboxReader('category_social.mbox') as mbox:
    for index, message in enumerate(mbox):
        if index > 100:
            break
        print(index, message.get('Date'), message.get('Subject'))

 

pythonでメールを一括取得する(さらに改善版)

前回のメール逐次取得版をモジュール化しました。

 

今回は、メッセージの取得だけに特化し、

デコード処理やコンテンツ処理は省略しました。

それらの処理が必要な場合は、メイン処理側で対応すること。

 

モジュールのドキュメンテーションも追加。

 

以下のコードを「mbox_reader.py」として保存して使用します。

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

"""
Module: mbox_reader.py

This module provides a class for reading messages from an mbox file.

Usage:
    from mbox_reader import MboxReader

    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for i, obj in enumerate(mbox):
            # Your code here

Classes:
    MboxReader(filename)
        A class for reading messages from an mbox file.

"""

import email
from email.policy import default
from email.header import decode_header

class MboxReader:
    """Iterate over the messages of an mbox file.

    Each iteration yields a message parsed by email.message_from_bytes
    with the modern ``default`` policy.  Use as a context manager so the
    underlying file handle is closed deterministically.
    """

    def __init__(self, filename):
        # Open in binary mode: mbox files may mix encodings across messages.
        self.handle = open(filename, 'rb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next message; raise StopIteration at end of file.

        Lines are buffered until the next ``From `` separator line, then
        parsed into a message.  The separator at the very start of the
        file produces a headerless (falsy) message, which is skipped.
        """
        lines = []
        while True:
            line = self.handle.readline()
            if not line:
                # EOF: flush the final buffered message.  The original
                # raised StopIteration unconditionally here, silently
                # dropping the last message of the file.
                if lines:
                    message = email.message_from_bytes(b'\n'.join(lines), policy=default)
                    if message:
                        return message
                raise StopIteration
            if line.startswith(b'From '):
                message = email.message_from_bytes(b'\n'.join(lines), policy=default)
                if message:
                    return message
                lines = []
            else:
                lines.append(line.rstrip(b'\r\n'))

if __name__ == "__main__":
    # Demo: print date and subject for the first 101 messages.
    with MboxReader('category_social.mbox') as mbox:
        for index, message in enumerate(mbox):
            if index > 100:
                break
            print(index, message.get('Date'), message.get('Subject'))

 

 

以下のとおり、呼び出し可能です。

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from mbox_reader import MboxReader

# Print date and subject for the first 101 messages in the mbox file.
with MboxReader('category_social.mbox') as mbox:
    for index, message in enumerate(mbox):
        if index > 100:
            break
        print(index, message.get('Date'), message.get('Subject'))