This program checks
the differences between the favorite blogs and reader blogs
of a user of Ameblo, one of the most famous Japanese blog sites.
Set the user in the following line:
ameblo_user = 'XXXXXXXXXXXXXX'
This script retrieves
links only from the favorite blogs and reader blogs
that the specified user has made public.
It does not retrieve private information or require login credentials.
To check for differences, it utilizes set (collection) difference checking.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# NOTE(review): the shebang and coding cookie should be the very first lines
# of the file; the prose description above belongs in a module docstring.

# Standard library
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin
from urllib.request import Request, urlopen
import time

# Third party
from bs4 import BeautifulSoup
import chardet
import re


def get_html_content(url):
    """Fetch *url* and return its decoded HTML, or '' on any failure.

    The response bytes are decoded with chardet's detected encoding;
    a failed detection (None) and the common 'Windows-1254' misdetection
    both fall back to UTF-8, and undecodable bytes are dropped
    (errors='ignore').
    """
    html_content = ''
    try:
        print('fetching url', url)
        q = Request(url)
        # Context manager ensures the response is closed even if
        # read() or the decoding below raises (was leaked before).
        with urlopen(q, timeout=15) as html:
            temp = html.read()
        detect_enc = chardet.detect(temp)['encoding']
        # chardet returns None for empty/undetectable input and frequently
        # misreports Japanese pages as Windows-1254; both cases use UTF-8.
        if detect_enc is None or detect_enc == 'Windows-1254':
            detect_enc = 'utf-8'
        html_content = temp.decode(detect_enc, errors='ignore')
    except Exception as e:
        # Best-effort fetch: log the failure and return '' so the caller
        # can treat it as "no more pages".
        print('fetching url failed', url, repr(e))
    return html_content
def get_site_links(site_url):
    """Collect [absolute_url, link_text] pairs from every page of *site_url*.

    Walks 'foo.html' -> 'foo-1.html', 'foo-2.html', ... until a page fails
    to load or repeats the previous page's links (Ameblo serves the last
    page again for out-of-range page numbers).

    Reads the module-level globals ``base_url`` and ``my_site``.
    Returns a list (possibly empty), never None.
    """
    total_links = []
    links_backup = []
    for page_no in range(1, 1000):
        site = site_url.replace('.html', '-' + str(page_no) + '.html')
        print(site)
        html_content = get_html_content(site)
        if not html_content:
            break
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            text = link.get_text()
            if href:
                absolute_link = urljoin(base_url, href)
                # Skip links back to the user's own blog and off-site links.
                if my_site in absolute_link:
                    continue
                if base_url not in absolute_link:
                    continue
                # Strip the trailing "updated" marker Ameblo appends.
                text = re.sub('更新$', '', text)
                links.append([absolute_link, text])
        # Out-of-range page numbers return the final page again; stop as
        # soon as the content repeats.
        if links_backup == links:
            return total_links
        links_backup = links
        total_links.extend(links)
        time.sleep(1)  # be polite to the server between page fetches
    # BUG FIX: previously the function fell off the end and returned None
    # when the first page failed to load (break) or 999 pages were
    # exhausted, crashing the caller's list comprehension.
    return total_links
# ----- Main -----
# Compare the public "favorite" and "reader" blog lists of one Ameblo user
# and print, for each direction, how many blogs appear in one list but not
# the other, followed by their names.
ameblo_user = 'XXXXXXXXXXXXXX'
ameblo_site = 'https://ameblo.jp/'
my_site = '{}{}'.format(ameblo_site, ameblo_user)
parsed_url = urlparse(my_site)
base_url = '{}://{}'.format(parsed_url.scheme, parsed_url.netloc)

# total_links[i] / total_names[i] hold the URLs / link texts scraped from
# target page i (0 = favorites, 1 = readers).
total_names = []
total_links = []
for page in ['/favorite.html', '/reader.html']:
    rets = get_site_links(my_site + page)
    total_links.append([ret[0] for ret in rets])
    total_names.append([ret[1] for ret in rets])

# Set difference in both directions: favorites-not-readers, then the reverse.
for sub0, sub1 in [(0, 1), (1, 0)]:
    diff_list = set(total_names[sub0]) - set(total_names[sub1])
    print(len(diff_list))
    for elem in diff_list:
        print(elem)