My Tech Life

Memo by a Japanese Software Developer in his late 50s.

An example of web scraping with Python driving a real browser.

Previously, I posted a web scraping example using urllib,

but it becomes difficult to handle

when there is a session involved.

 

Using Selenium to utilize a browser

makes it easier to handle sessions.

 

In the following sample:

You set URL, XPATH, and search string in the xpath_list.

 

Loop through xpath_list,

get HTML through the browser,

and convert the returned HTML to text using BS4.

 

A common way to obtain XPATH for setting in the list is

to use the browser's built-in developer tools,

specify the desired position,

and copy the XPATH from the copy menu.

 

#!/usr/bin/env python3
# -*- coding: utf8 -*-

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import re
from bs4 import BeautifulSoup

# Each entry: [URL, XPath of the search input, query string]
xpath_list = [
        ['https://www.yahoo.co.jp', '/html/body/div/div/header/section[1]/div/form/fieldset/span/input', 'Selenium'],
        ['https://dev.to', '/html/body/header/div/div[1]/form/div/div/input', 'Selenium']
]

for url, xpath, query in xpath_list:
    # Fresh browser per site keeps sessions isolated between targets.
    driver = webdriver.Firefox()
    try:
        driver.get(url)

        # Locate the search box and submit the query.
        search_box = driver.find_element(by=By.XPATH, value=xpath)
        search_box.send_keys(query)
        search_box.send_keys(Keys.ENTER)

        # Wait for the result page to load BEFORE grabbing the HTML.
        # (The original captured page_source first and slept afterwards,
        # so the search results could be missing from the capture.)
        time.sleep(1)
        html = driver.page_source
    finally:
        # Always release the browser, even if navigation or lookup fails;
        # the original leaked the driver on any exception.
        driver.quit()

    # Strip markup with BS4 and print only the non-empty text lines.
    soup = BeautifulSoup(html, 'html.parser')
    for text_line in soup.get_text().split('\n'):
        if not re.search(r'^\s*$', text_line):
            print(text_line)

    time.sleep(1)

 

 

ブラウザとPythonを連携させてWebスクレイピング例

以前、urllibを使ったWebスクレイピング例を投稿したが、

セッションがあると、扱うのが難しくなってくる。

 

Seleniumを使って、ブラウザを利用すると、

セッションが扱いやすくなる。

 

以下サンプルでは、

xpath_listに、URL、XPATH、検索文字列、を設定する。

xpath_listでループして、ブラウザを通して、HTMLを取得する。

BS4を使って、戻ってきたHTMLをテキストに変換している。

 

リストに設定するXPATHを取得する一般的な方法は、

ブラウザ付属のウェブ開発ツールで、

入力したい位置を指定し、コピーメニューで、XPATHをコピーする。

 

#!/usr/bin/env python3
# -*- coding: utf8 -*-

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import re
from bs4 import BeautifulSoup

# Each entry: [URL, XPath of the search input, query string]
xpath_list = [
        ['https://www.yahoo.co.jp', '/html/body/div/div/header/section[1]/div/form/fieldset/span/input', 'Selenium'],
        ['https://dev.to', '/html/body/header/div/div[1]/form/div/div/input', 'Selenium']
]

for url, xpath, query in xpath_list:
    # Fresh browser per site keeps sessions isolated between targets.
    driver = webdriver.Firefox()
    try:
        driver.get(url)

        # Locate the search box and submit the query.
        search_box = driver.find_element(by=By.XPATH, value=xpath)
        search_box.send_keys(query)
        search_box.send_keys(Keys.ENTER)

        # Wait for the result page to load BEFORE grabbing the HTML.
        # (The original captured page_source first and slept afterwards,
        # so the search results could be missing from the capture.)
        time.sleep(1)
        html = driver.page_source
    finally:
        # Always release the browser, even if navigation or lookup fails;
        # the original leaked the driver on any exception.
        driver.quit()

    # Strip markup with BS4 and print only the non-empty text lines.
    soup = BeautifulSoup(html, 'html.parser')
    for text_line in soup.get_text().split('\n'):
        if not re.search(r'^\s*$', text_line):
            print(text_line)

    time.sleep(1)

 

 

Fetching emails in bulk using Python (further improved version).

I've modularized the previous version of the email fetching script,

focusing solely on message retrieval.

I omitted the decoding and content processing,

leaving those tasks to be handled in the main processing script if needed.

 

I've also added documentation for the module.

 

Save the following code as "mbox_reader.py" and use it:

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

"""
Module: mbox_reader.py

This module provides a class for reading messages from an mbox file.

Usage:
    from mbox_reader import MboxReader

    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for i, obj in enumerate(mbox):
            # Your code here

Classes:
    MboxReader(filename)
        A class for reading messages from an mbox file.

"""

import email
from email.policy import default
from email.header import decode_header

class MboxReader:
    """Iterate over the messages of an mbox file.

    Each iteration yields a message parsed by email.message_from_bytes
    with the modern ``default`` policy.  Use as a context manager so the
    underlying file handle is closed deterministically.
    """

    def __init__(self, filename):
        # Open in binary mode: mbox files may mix encodings across messages.
        self.handle = open(filename, 'rb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next message; raise StopIteration at end of file.

        Lines are buffered until the next ``From `` separator line, then
        parsed into a message.  The separator at the very start of the
        file produces a headerless (falsy) message, which is skipped.
        """
        lines = []
        while True:
            line = self.handle.readline()
            if not line:
                # EOF: flush the final buffered message.  The original
                # raised StopIteration unconditionally here, silently
                # dropping the last message of the file.
                if lines:
                    message = email.message_from_bytes(b'\n'.join(lines), policy=default)
                    if message:
                        return message
                raise StopIteration
            if line.startswith(b'From '):
                message = email.message_from_bytes(b'\n'.join(lines), policy=default)
                if message:
                    return message
                lines = []
            else:
                lines.append(line.rstrip(b'\r\n'))

if __name__ == "__main__":
    # Demo: print date and subject for the first 101 messages.
    with MboxReader('category_social.mbox') as mbox:
        for index, message in enumerate(mbox):
            if index > 100:
                break
            print(index, message.get('Date'), message.get('Subject'))

 

 

It can be called as follows:

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from mbox_reader import MboxReader

# Print date and subject for the first 101 messages in the mbox file.
with MboxReader('category_social.mbox') as mbox:
    for index, message in enumerate(mbox):
        if index > 100:
            break
        print(index, message.get('Date'), message.get('Subject'))

 

pythonでメールを一括取得する(さらに改善版)

前回のメール逐次取得版をモジュール化しました。

 

今回は、メッセージの取得だけに特化し、

デコード処理やコンテンツ処理は省略しました。

それらの処理が必要な場合は、メイン処理側で対応すること。

 

モジュールのドキュメンテーションも追加。

 

以下のコードを「mbox_reader.py」として保存して使用します。

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

"""
Module: mbox_reader.py

This module provides a class for reading messages from an mbox file.

Usage:
    from mbox_reader import MboxReader

    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for i, obj in enumerate(mbox):
            # Your code here

Classes:
    MboxReader(filename)
        A class for reading messages from an mbox file.

"""

import email
from email.policy import default
from email.header import decode_header

class MboxReader:
    """Iterate over the messages of an mbox file.

    Each iteration yields a message parsed by email.message_from_bytes
    with the modern ``default`` policy.  Use as a context manager so the
    underlying file handle is closed deterministically.
    """

    def __init__(self, filename):
        # Open in binary mode: mbox files may mix encodings across messages.
        self.handle = open(filename, 'rb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next message; raise StopIteration at end of file.

        Lines are buffered until the next ``From `` separator line, then
        parsed into a message.  The separator at the very start of the
        file produces a headerless (falsy) message, which is skipped.
        """
        lines = []
        while True:
            line = self.handle.readline()
            if not line:
                # EOF: flush the final buffered message.  The original
                # raised StopIteration unconditionally here, silently
                # dropping the last message of the file.
                if lines:
                    message = email.message_from_bytes(b'\n'.join(lines), policy=default)
                    if message:
                        return message
                raise StopIteration
            if line.startswith(b'From '):
                message = email.message_from_bytes(b'\n'.join(lines), policy=default)
                if message:
                    return message
                lines = []
            else:
                lines.append(line.rstrip(b'\r\n'))

if __name__ == "__main__":
    # Demo: print date and subject for the first 101 messages.
    with MboxReader('category_social.mbox') as mbox:
        for index, message in enumerate(mbox):
            if index > 100:
                break
            print(index, message.get('Date'), message.get('Subject'))

 

 

以下のとおり、呼び出し可能です。

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from mbox_reader import MboxReader

# Print date and subject for the first 101 messages in the mbox file.
with MboxReader('category_social.mbox') as mbox:
    for index, message in enumerate(mbox):
        if index > 100:
            break
        print(index, message.get('Date'), message.get('Subject'))