Python script to check the differences between the follower and followed blogs of a user on a Japanese blog site.
This program checks the differences between the favorite blogs and reader blogs of a user on Ameblo, one of the most famous Japanese blog sites.
Set the user in the following line:
ameblo_user = 'XXXXXXXXXXXXXX'
This script retrieves links only from the favorite blogs and reader blogs that the specified user has made public. It does not retrieve private information or require login credentials. To compute the differences, it uses Python's set (collection) difference.
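As a minimal illustration of the set-difference idea the script relies on (the blog names here are made up):

favorites = {'blog-a', 'blog-b', 'blog-c'}
readers = {'blog-b', 'blog-c', 'blog-d'}
print(favorites - readers)  # {'blog-a'}: favorites that are not readers
print(readers - favorites)  # {'blog-d'}: readers that are not favorites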
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin
import chardet
import time
import re

def get_html_content(url):
    html_content = ''
    try:
        print('fetching url', url)
        q = Request(url)
        html = urlopen(q, timeout=15)
        temp = html.read()
        # Detect the page encoding; fall back to UTF-8 when detection
        # fails or misfires (Windows-1254 is a known chardet false positive).
        detect_enc = chardet.detect(temp)['encoding']
        if detect_enc is None or detect_enc == 'Windows-1254':
            detect_enc = 'utf-8'
        html_content = temp.decode(detect_enc, errors='ignore')
    except Exception as e:
        print('fetching url failed', url, repr(e))
    return html_content
def get_site_links(site_url):
    total_links = []
    links_backup = []
    for page_no in range(1, 1000):
        # Paginated list pages look like favorite-1.html, favorite-2.html, ...
        site = site_url.replace('.html', '-' + str(page_no) + '.html')
        print(site)
        html_content = get_html_content(site)
        if not html_content:
            break
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            text = link.get_text()
            if href:
                absolute_link = urljoin(base_url, href)
                # Skip links back to the user's own pages and links
                # pointing outside ameblo.jp.
                if my_site in absolute_link:
                    continue
                if base_url not in absolute_link:
                    continue
                # Strip the trailing "更新" (updated) marker from the link text.
                text = re.sub('更新$', '', text)
                links.append([absolute_link, text])
        # Past the last page the site serves the same list again,
        # so stop as soon as a page repeats the previous one.
        if links_backup == links:
            return total_links
        links_backup = links
        total_links.extend(links)
        time.sleep(1)  # be polite to the server
    return total_links  # also return what we have if a fetch fails
# ----- Main -----
ameblo_user = 'XXXXXXXXXXXXXX'
ameblo_site = 'https://ameblo.jp/'
my_site = '{}{}'.format(ameblo_site, ameblo_user)
parsed_url = urlparse(my_site)
base_url = '{}://{}'.format(parsed_url.scheme, parsed_url.netloc)

total_names = []
total_links = []
target_pages = ['/favorite.html', '/reader.html']
for page in target_pages:
    site_url = my_site + page
    rets = get_site_links(site_url)
    links = [ret[0] for ret in rets]
    total_links.append(links)
    names = [ret[1] for ret in rets]
    total_names.append(names)

# Diff the two name lists in both directions and print each difference.
subs = [(0, 1), (1, 0)]
for sub in subs:
    sub0 = sub[0]
    sub1 = sub[1]
    diff_list = set(total_names[sub0]) - set(total_names[sub1])
    print(len(diff_list))
    for elem in diff_list:
        print(elem)
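The script also collects the blog URLs, so the same diff can be taken over total_links instead of total_names, which sidesteps the case where two different blogs share a display name. A minimal sketch reusing the lists built above:

for a, b in [(0, 1), (1, 0)]:
    diff = set(total_links[a]) - set(total_links[b])
    print(len(diff))
    for url in diff:
        print(url)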
How to group emails by week into arrays in Python
When conducting data analysis, you have to decide what unit to group the data by. I am in the middle of some data analysis on emails collected in bulk, and grouping the emails by week looked like the right choice. Here is a sample: data is prepared as a two-dimensional array, where the first element of each row is a date string and the second is the body text. As preparation, the dates are parsed into datetime objects and put into one list, and the texts into another. For each date, the script finds which week of the year it falls in, builds a key from the year and month plus the week number, and uses it as a dictionary key. Each dictionary entry has 'dates' and 'texts' sub-keys, each holding a list.
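One caveat with isocalendar(): it returns the ISO year, week number, and weekday, and around New Year the ISO week belongs to the neighboring ISO year, so a key built from the calendar year plus the ISO week can look odd there. A quick check (these dates are only for illustration):

from datetime import datetime

# Jan 1, 2017 was a Sunday and falls in ISO week 52 of ISO year 2016.
print(tuple(datetime(2017, 1, 1).isocalendar()))    # (2016, 52, 7)
# Dec 25, 2017 (used in the sample below) is ISO week 52 of 2017.
print(tuple(datetime(2017, 12, 25).isocalendar()))  # (2017, 52, 1)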
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from datetime import datetime

data = [
    ["Thu, 18 May 2017 02:24:34 +0000", "This is mail 1"],
    ["Thu, 08 Jun 2017 23:13:22 +0000", "This is mail 2"],
    ["Mon, 25 Dec 2017 03:12:33 +0000", "This is mail 3"],
    ["Wed, 13 Sep 2017 22:46:44 -0700", "This is mail 4"],
    ["Wed, 13 Sep 2017 20:11:18 -0700", "This is mail 5"],
    ["Wed, 13 Sep 2017 10:19:19 -0700", "This is mail 6"],
    ["Tue, 12 Sep 2017 22:59:06 -0700", "This is mail 7"]
]

# Parse the RFC 2822 style date strings into timezone-aware datetimes.
datetime_dates = [datetime.strptime(datum[0], "%a, %d %b %Y %H:%M:%S %z") for datum in data]
texts = [datum[1] for datum in data]

# Group the mails under a (year, month, ISO week number) key.
week_groups = {}
for (date, text) in zip(datetime_dates, texts):
    date_key = (date.year, date.month, date.isocalendar()[1])
    if date_key not in week_groups:
        week_groups[date_key] = {}
        week_groups[date_key]['dates'] = []
        week_groups[date_key]['texts'] = []
    week_groups[date_key]['dates'].append(date)
    week_groups[date_key]['texts'].append(text)

# Print the groups in key order.
for key in sorted(week_groups.keys()):
    print(key)
    dates = week_groups[key]['dates']
    texts = week_groups[key]['texts']
    for (date, text) in zip(dates, texts):
        print(date, text)
Here's the result.
(base) C:\pytest>python test6.py
(2017, 5, 20)
2017-05-18 02:24:34+00:00 This is mail 1
(2017, 6, 23)
2017-06-08 23:13:22+00:00 This is mail 2
(2017, 9, 37)
2017-09-13 22:46:44-07:00 This is mail 4
2017-09-13 20:11:18-07:00 This is mail 5
2017-09-13 10:19:19-07:00 This is mail 6
2017-09-12 22:59:06-07:00 This is mail 7
(2017, 12, 52)
2017-12-25 03:12:33+00:00 This is mail 3
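As a design note, the if-not-in initialization can be written more compactly with collections.defaultdict; this sketch reuses the datetime_dates and texts lists from the script above and builds the same grouping:

from collections import defaultdict

week_groups = defaultdict(lambda: {'dates': [], 'texts': []})
for date, text in zip(datetime_dates, texts):
    date_key = (date.year, date.month, date.isocalendar()[1])
    week_groups[date_key]['dates'].append(date)
    week_groups[date_key]['texts'].append(text)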