アメブロユーザの
お気に入りブログと読者ブログの差分を
確認するプログラム。
以下の行にユーザを設定する。
ameblo_user = 'XXXXXXXXXXXXXX'
このスクリプトは、
指定されたユーザーが公開している、
お気に入りブログと読者ブログのリンクのみを取得する。
非公開の情報やログインが必要な情報は取得しない。
差分確認には、
セット(集合)の差分チェックを使っている。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import time
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin

import chardet
from bs4 import BeautifulSoup
def get_html_content(url):
    """Download *url* and return its body decoded to text.

    The byte encoding is guessed with ``chardet``.  When detection yields
    nothing, or yields ``Windows-1254``, UTF-8 is used instead.  Bytes that
    cannot be decoded are silently dropped (``errors='ignore'``).

    Args:
        url: URL of the page to fetch.

    Returns:
        The decoded page text, or an empty string if any step fails.  The
        failure is printed rather than raised, so callers can treat an empty
        result as "nothing more to read".
    """
    html_content = ''
    try:
        print('fetching url', url)
        request = Request(url)
        # The context manager closes the HTTP response even if read() raises.
        with urlopen(request, timeout=15) as response:
            raw = response.read()
        detected = chardet.detect(raw)['encoding']
        # NOTE(review): Windows-1254 is presumably a misdetection of UTF-8
        # pages; confirm before removing this override.
        if detected is None or detected == 'Windows-1254':
            detected = 'utf-8'
        html_content = raw.decode(detected, errors='ignore')
    except Exception as e:
        # Deliberately best-effort: any failure results in an empty string.
        print('fetching url failed', url, repr(e))
    return html_content
def get_site_links(site_url):
    """Collect the links found on every page of a paginated list.

    Page URLs are built by replacing ``.html`` in *site_url* with
    ``-<n>.html`` for n = 1, 2, ...  Paging stops when a page cannot be
    fetched, when a page yields exactly the same links as the previous page,
    or after 999 pages.

    Reads the module-level names ``base_url`` (links are resolved against it
    and must contain it) and ``my_site`` (links containing it are skipped).

    Args:
        site_url: URL of the list page, ending in ``.html``.

    Returns:
        A list of ``[absolute_url, link_text]`` pairs.  The result is always
        a list, and is empty when nothing could be fetched.
    """
    total_links = []
    previous_links = []
    for page_no in range(1, 1000):
        site = site_url.replace('.html', '-' + str(page_no) + '.html')
        print(site)
        html_content = get_html_content(site)
        if not html_content:
            # Fetch failed or the page was empty: stop paging.
            break

        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            absolute_link = urljoin(base_url, href)
            # Skip links that point back into the user's own blog.
            if my_site in absolute_link:
                continue
            # Skip links that leave the base site.
            if base_url not in absolute_link:
                continue
            # Remove a trailing "更新" from the anchor text.
            text = re.sub('更新$', '', anchor.get_text())
            links.append([absolute_link, text])

        # A page identical to the previous one means there is nothing new.
        if links == previous_links:
            break
        previous_links = links
        total_links.extend(links)
        # Pause between requests to avoid hammering the server.
        time.sleep(1)
    return total_links
# ----- Main -----
# Set the Ameblo user whose public lists should be compared.
ameblo_user = 'XXXXXXXXXXXXXX'
ameblo_site = 'https://ameblo.jp/'
my_site = '{}{}'.format(ameblo_site, ameblo_user)

# Scheme + host of the site; get_site_links() reads this as a global.
parsed_url = urlparse(my_site)
base_url = '{}://{}'.format(parsed_url.scheme, parsed_url.netloc)

# One entry per target page, in the same order as target_pages.
total_names = []
total_links = []
target_pages = ['/favorite.html', '/reader.html']
for page in target_pages:
    site_url = my_site + page
    # Guard against a None result so a failed fetch yields an empty list
    # instead of a TypeError in the comprehensions below.
    rets = get_site_links(site_url) or []
    total_links.append([ret[0] for ret in rets])
    total_names.append([ret[1] for ret in rets])

# Print the set difference of the link texts in both directions:
# first (page 0 - page 1), then (page 1 - page 0).
subs = [(0, 1), (1, 0)]
for sub0, sub1 in subs:
    diff_list = set(total_names[sub0]) - set(total_names[sub1])
    print(len(diff_list))
    # Sorted so the output order is the same on every run.
    for elem in sorted(diff_list):
        print(elem)