import pandas as pd
import numpy as np
import sqlite3
from pprint import pprint
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import time
from random import randint
from time import sleep
import warnings
warnings.filterwarnings('ignore')
# Fetch page 1 of MAL's "Top Manga" ranking (limit=0 is the first page of 50).
Topmanga = requests.get("https://myanimelist.net/topmanga.php?limit=0")
Topmanga_tree = BeautifulSoup(Topmanga.content, "html.parser")  # a fresh soup must be built for each page
# MAL paginates the ranking 50 titles at a time via the `limit` offset,
# so enumerate every offset and build the full list of page URLs.
Pagenbr = np.arange(0, 56450, 50).tolist()
Pagelist = [f"https://myanimelist.net/topmanga.php?limit={offset}" for offset in Pagenbr]
#Let's create a function that we will apply to each page to retrieve every link.
Linklist = []        # ordered list of unique manga URLs (BUG FIX: this init was commented out -> NameError)
_seen_links = set()  # mirror of Linklist for O(1) membership tests instead of O(n) list scans

def link_retriever(page):
    """Fetch one top-manga index page and append every new manga link to Linklist.

    page: URL string of a https://myanimelist.net/topmanga.php?limit=N page.
    Side effects: extends the module-level Linklist (and _seen_links); returns None.
    """
    Page = requests.get(page)
    Page_tree = BeautifulSoup(Page.content, "html.parser")
    # Each ranked title sits in an <h3> heading whose <a> points at /manga/<id>/...
    for manga in Page_tree.find_all('h3'):
        for link in manga.find_all('a'):
            href = link.get('href')
            # BUG FIX: get('href') can return None for attribute-less anchors,
            # which made `'manga' in href` raise TypeError.
            if href and 'manga' in href and href not in _seen_links:
                _seen_links.add(href)
                Linklist.append(href)
# Walk every ranking page, logging progress (URL + running link count) as we go.
for page_url in Pagelist:
    print(page_url)
    print(len(Linklist))
    link_retriever(page_url)
pprint(len(Linklist))
56407
def anydup(thelist):
    """Return True as soon as a duplicate element is found in thelist, else False."""
    seen = set()
    for item in thelist:
        before = len(seen)
        seen.add(item)
        if len(seen) == before:  # the add was a no-op -> item already seen
            return True
    return False
# Build the result table: one row per manga URL, all info columns empty strings.
df = pd.DataFrame({'Webpage': Linklist})
#del df
info_columns = ['Name', 'Status', 'Type', 'Rating', 'Raters', 'Members',
                'Started', 'Ended', 'Demographic', 'Serialization', 'Chapters']
for info_column in info_columns:
    df[info_column] = ''
df
| Webpage | Name | Status | Type | Rating | Raters | Members | Started | Ended | Demographic | Serialization | Chapters | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://myanimelist.net/manga/2/Berserk | |||||||||||
| 1 | https://myanimelist.net/manga/1706/JoJo_no_Kim... | |||||||||||
| 2 | https://myanimelist.net/manga/13/One_Piece | |||||||||||
| 3 | https://myanimelist.net/manga/656/Vagabond | |||||||||||
| 4 | https://myanimelist.net/manga/1/Monster | |||||||||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 56402 | https://myanimelist.net/manga/88606/Triple_Ojo... | |||||||||||
| 56403 | https://myanimelist.net/manga/88607/Ore_ga♀de_... | |||||||||||
| 56404 | https://myanimelist.net/manga/88608/Ojousama_t... | |||||||||||
| 56405 | https://myanimelist.net/manga/88609/Neratta_On... | |||||||||||
| 56406 | https://myanimelist.net/manga/88610/Mazomai__M... |
56407 rows × 12 columns
# Persist the URL table, then spot-check the parsing approach on one page (Berserk).
df.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPages.csv', index = False)
#https://myanimelist.net/manga/2/Berserk
Mangainfo = requests.get('https://myanimelist.net/manga/2/Berserk')
Mangainfo_tree = BeautifulSoup(Mangainfo.content, "html.parser")
# Every "Label: value" line of the sidebar lives in a div.spaceit_pad.
infolist = [info.get_text().strip()
            for info in Mangainfo_tree.find_all('div', {"class": "spaceit_pad"})]
infolistclean = []
for entry in infolist:
    if ":" in entry:
        # keep only the value part, e.g. 'Type: Manga' -> 'Manga'
        entry = entry.split(":", 1)[1].strip()
    infolistclean.append(entry)
#pprint(infolistclean)
['Berserk', 'Berserk: The Prototype', 'ベルセルク', 'Manga', '41', '380', 'On Hiatus', 'Aug 25, 1989 to Sep 10, 2021', 'Action\n' '\n' ' Action, Adventure\n' '\n' ' Adventure, Drama\n' '\n' ' Drama, Fantasy\n' '\n' ' Fantasy, Horror\n' '\n' ' Horror, Supernatural\n' '\n' ' Supernatural', 'Demons\n' '\n' ' Demons, Military\n' '\n' ' Military, Psychological\n' '\n' ' Psychological', 'Seinen\n\n Seinen', 'Young Animal', 'Miura, Kentarou (Story & Art)', '9.411 (scored by 214699214,699 users)1 indicates a weighted score.', "#12 2 based on the top manga page. Please note that 'R18+' titles are " 'excluded.', '#2', '452,586', '84,928']
# Reload the URL table and force the free-text columns to str so that missing
# values compare equal to the literal 'nan' later on.
dffilled = pd.read_csv('MangaPages.csv')
columnstr = ['Name', 'Status', 'Type', 'Started', 'Demographic', 'Serialization']
dffilled[columnstr] = dffilled[columnstr].astype(str)
def infofiller(row):
    """Scrape the MAL page for `row` of the global `dffilled` frame and fill
    that row's info columns in place.

    row: index label into dffilled; the URL is read from dffilled.at[row, 'Webpage'].
    Side effects: mutates dffilled. Returns None.
    Raises KeyError (field missing from page) or ValueError (unparseable
    number/date) — callers catch these and log the row for retry.
    """
    Mangainfo = requests.get(dffilled.at[row, 'Webpage'])
    Mangainfo_tree = BeautifulSoup(Mangainfo.content, "html.parser")
    # Every "Label: value" sidebar line lives in a div.spaceit_pad; split each
    # on the first ':' into a lookup dict.
    infolist = []
    dictclean = {}
    for info in Mangainfo_tree.find_all('div', {"class": "spaceit_pad"}):
        infolist.append(info.get_text().strip())
    for entry in infolist:
        if ":" in entry:
            label, value = entry.split(":", 1)
            dictclean[label] = value.strip()
    # Name: prefer the English title, then synonyms, then the Japanese title.
    if 'English' in dictclean:
        dffilled.at[row, 'Name'] = dictclean['English']
    elif 'Synonyms' in dictclean:
        dffilled.at[row, 'Name'] = dictclean['Synonyms']
    elif 'Japanese' in dictclean:
        dffilled.at[row, 'Name'] = dictclean['Japanese']
    else:
        dffilled.at[row, 'Name'] = np.nan
    if 'Status' in dictclean:
        dffilled.at[row, 'Status'] = dictclean['Status']
    else:
        dffilled.at[row, 'Status'] = np.nan
    dffilled.at[row, 'Type'] = dictclean['Type']
    if 'N/A' in dictclean['Score']:
        dffilled.at[row, 'Rating'] = np.nan
    else:
        # Score text looks like '9.411 (scored by 214699214,699 users)1 ...':
        # the first token is the score with a superscript footnote '1' glued on.
        # BUG FIX: the old `.split('1', 1)[0]` chopped at the FIRST '1', so
        # '9.411' became 9.4 and '8.121' became 8.0; drop the trailing footnote
        # character instead. (NOTE(review): assumes the footnote marker is
        # always appended when a score is present — matches the sample output,
        # confirm against live pages.)
        score_token = dictclean['Score'].split(' ', 1)[0]
        dffilled.at[row, 'Rating'] = float(score_token[:-1])
        # The user count is rendered twice back-to-back ('214699214,699');
        # strip commas and keep the first half of the digits.
        raters = dictclean['Score'].split(' ', 4)[3].replace(",", "")
        raters = int(raters[:len(raters) // 2])
        dffilled.at[row, 'Raters'] = raters
    members = int(dictclean['Members'].replace(",", ""))
    dffilled.at[row, 'Members'] = members
    if dictclean['Published'] in ('Not available', 'N/A'):
        dffilled.at[row, 'Started'] = np.nan
        dffilled.at[row, 'Ended'] = np.nan
    else:
        # Normalise e.g. 'Aug  25, 1989' -> 'Aug 25 1989': drop commas and one
        # space (presumably MAL emits a doubled space after the month — the
        # strptime formats below only work if exactly one extra space exists;
        # TODO confirm against live pages).
        startdate = dictclean['Published'].split('to', 1)[0].replace(",", "").strip().replace(" ", "", 1)
        # Choose the format by string length: year only / month+year / full date.
        if len(startdate) < 5:
            dffilled.at[row, 'Started'] = datetime.strptime(startdate, '%Y').date()
        elif len(startdate) < 9:
            dffilled.at[row, 'Started'] = datetime.strptime(startdate, '%b %Y').date()
        else:
            dffilled.at[row, 'Started'] = datetime.strptime(startdate, '%b %d %Y').date()
        if 'to' in dictclean['Published']:
            enddate = dictclean['Published'].split('to', 1)[1].replace(",", "").strip().replace(" ", "", 1)
        else:
            enddate = '?'  # single-date publication: no end date on the page
        if enddate != '?':
            if len(enddate) < 5:
                dffilled.at[row, 'Ended'] = datetime.strptime(enddate, '%Y').date()
            elif len(enddate) < 9:
                dffilled.at[row, 'Ended'] = datetime.strptime(enddate, '%b %Y').date()
            else:
                dffilled.at[row, 'Ended'] = datetime.strptime(enddate, '%b %d %Y').date()
        else:
            dffilled.at[row, 'Ended'] = np.nan
    # MAL labels this field 'Demographics' or 'Demographic' depending on the
    # page; the value repeats after a newline, keep only the first line.
    if 'Demographics' in dictclean:
        dffilled.at[row, 'Demographic'] = dictclean['Demographics'].split('\n', 1)[0]
    elif 'Demographic' in dictclean:
        dffilled.at[row, 'Demographic'] = dictclean['Demographic'].split('\n', 1)[0]
    else:
        dffilled.at[row, 'Demographic'] = np.nan
    dffilled.at[row, 'Serialization'] = dictclean['Serialization'].strip()
    if dictclean['Chapters'] != 'Unknown':
        dffilled.at[row, 'Chapters'] = int(dictclean['Chapters'].strip())
    else:
        dffilled.at[row, 'Chapters'] = np.nan
# Resume scraping the back half of the table in batches of 40 rows, with random
# sleeps to avoid rate limiting and a checkpoint CSV every ~1000 processed rows.
#indexlist1 = np.arange(8807, 26407, 40).tolist()
indexlist2 = np.arange(26406, 56407, 40).tolist()
errorlist = []  # indices whose scrape raised; retried later
dffilled = pd.read_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv')
dffilled['Name'] = dffilled['Name'].astype(str)
dffilled['Status'] = dffilled['Status'].astype(str)
counter = 0
for index in indexlist2:
    print(index)
    indexstart = index - 40
    for i, row in dffilled[indexstart:index].iterrows():
        # 'nan' Name marks a row not yet (successfully) scraped.
        if dffilled.at[i, 'Name'] == 'nan':
            try:
                infofiller(i)
            except (Exception) as e:
                print('Error at index {}: {!r}'.format(i, row))
                print(e)
                errorlist.append(i)
                time.sleep(180)  # long back-off after a failure
            if counter % 10 == 0:
                time.sleep(randint(10, 35))
            if counter % 1000 == 0:
                print('we are at the' + str(i) + 'th value')
                dffilled.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp2.csv', index = False)
            # BUG FIX: was `counter =+ 1`, which assigns +1 every pass and
            # left counter stuck at 1, so the throttle/checkpoint modulos
            # above fired on every single row.
            counter += 1
    time.sleep(randint(30, 70))  # pause between 40-row batches
pd.set_option("display.max_rows", 30)
# Re-scrape one row that had been missed and inspect it.
infofiller(9431)
dffilled[9430:9431]
print(len(errorlist))
print(errorlist)
# Persist the failed indices, one per line.
# BUG FIX: use a `with` block so the file handle is closed even if a write raises.
with open("a_file.txt", "w") as textfile:
    for element in errorlist:
        textfile.write(str(element) + "\n")
129 [26458, 27606, 27847, 28726, 29246, 29646, 30086, 30446, 31046, 31206, 31406, 31606, 31926, 32206, 33291, 33686, 34606, 35166, 35646, 35886, 37966, 38449, 38614, 38772, 39206, 39366, 39526, 39561, 39766, 39926, 40086, 40236, 40486, 40646, 40808, 41046, 41326, 41566, 41726, 41966, 42126, 42286, 42566, 42751, 43007, 43206, 43368, 43526, 43686, 43846, 44007, 44166, 44366, 44394, 44590, 44598, 44605, 44687, 44846, 45006, 45166, 45326, 45414, 45568, 45726, 45886, 46086, 46246, 46406, 46567, 46726, 46867, 47047, 47215, 47366, 47442, 47606, 47766, 47926, 48086, 48246, 48359, 48686, 48846, 49006, 49166, 49366, 49526, 49686, 49926, 50046, 50206, 50277, 50278, 50446, 50615, 50755, 50877, 51046, 51206, 51406, 51566, 51726, 51926, 52086, 52262, 52486, 52686, 52846, 53006, 53167, 53446, 53606, 53806, 54006, 54166, 54366, 54526, 54692, 54846, 54853, 55046, 55206, 55366, 55526, 55726, 56047, 56206, 56363]
# Retry every index that failed in the main loop; indices that fail again are
# collected in errorlist2 for a further pass.
print(len(errorlist))
errorlist2 = []
counter2 = 0
for i in errorlist:
    print(i)
    if dffilled.at[i, 'Name'] == 'nan':
        try:
            infofiller(i)
        except (Exception) as e:
            # BUG FIX: the message referenced `row`, a stale leftover from the
            # previous loop (NameError on a fresh run); report the index only.
            print('Error at index {}'.format(i))
            print(e)
            errorlist2.append(i)
            time.sleep(180)
        # BUG FIX: the three checks below tested `counter` (stale from the main
        # loop) instead of this loop's own counter2.
        if counter2 % 10 == 0:
            time.sleep(randint(10, 35))
        if counter2 % 1000 == 0:
            # BUG FIX: `'...' + i` raised TypeError (int concat); wrap in str().
            print('we are at the' + str(i) + 'th value')
            dffilled.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv', index = False)
        counter2 += 1  # BUG FIX: was `counter2 =+ 1` (assignment, not increment)
        if counter2 % 20 == 0:
            # BUG FIX: randint() with no arguments raises TypeError; use the
            # same pause range as the main loop.
            time.sleep(randint(30, 70))
            print('+ 20 ezclap')
dffilled.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv', index = False)
#dffilled = pd.read_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv')
# Re-cast after the CSV round-trip, then pull rows that scraped partially:
# Status was filled but no usable Name was found on the page.
for col in ('Name', 'Status'):
    dffilled[col] = dffilled[col].astype(str)
nameerrors = dffilled[(dffilled['Name'] == 'nan') & (dffilled['Status'] != 'nan')]
nameerrors
| Webpage | Name | Status | Type | Rating | Raters | Members | Started | Ended | Demographic | Serialization | Chapters | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3052 | https://myanimelist.net/manga/7071/B-Eyes | nan | Finished | Manga | 7.43 | 425.0 | 865.0 | 2000-01-01 | NaN | Shoujo | None | 17.0 |
| 3170 | https://myanimelist.net/manga/13070/Wait_Wolf | nan | Finished | Manhwa | 7.42 | 460.0 | 1412.0 | NaN | NaN | NaN | None | NaN |
| 3488 | https://myanimelist.net/manga/4795/Rurouni_Ken... | nan | Finished | Light Novel | 7.38 | 437.0 | 1399.0 | 1996-10-01 | NaN | Shounen | None | 19.0 |
| 3580 | https://myanimelist.net/manga/3070/Shokugyou_T... | nan | Finished | Manga | 7.37 | 421.0 | 804.0 | 2003-01-01 | NaN | NaN | None | 4.0 |
| 3721 | https://myanimelist.net/manga/4657/Winter_Flowers | nan | Finished | One-shot | 7.35 | 1500.0 | 2520.0 | 2007-01-01 | NaN | Shoujo | Cheese! | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 48786 | https://myanimelist.net/manga/43335/Brother_X_... | nan | Finished | One-shot | NaN | NaN | 45.0 | NaN | NaN | NaN | None | 1.0 |
| 48876 | https://myanimelist.net/manga/45193/Hideout_ZIN | nan | Finished | One-shot | NaN | NaN | 192.0 | NaN | NaN | NaN | None | 1.0 |
| 48878 | https://myanimelist.net/manga/45227/Sweet_Drug | nan | Finished | One-shot | NaN | NaN | 130.0 | NaN | NaN | NaN | None | 1.0 |
| 48909 | https://myanimelist.net/manga/45615/Takatoh-sa... | nan | Finished | Manga | NaN | NaN | 83.0 | NaN | NaN | NaN | Magazine Be x Boy | 8.0 |
| 52587 | https://myanimelist.net/manga/59381/Switch_dj_... | nan | Finished | Doujinshi | NaN | NaN | 273.0 | 2009-08-16 | NaN | NaN | None | 1.0 |
604 rows × 12 columns
# NOTE(review): `errorlist3` is never defined in this file — presumably it was
# produced by a notebook cell run out of order; confirm before a top-to-bottom rerun.
print(len(errorlist3))
print(errorlist3)
# Rows where both Name and Status are still 'nan': pages that failed completely.
fullerrors2 = dffilled[(dffilled['Name'] == 'nan') & (dffilled['Status'] == 'nan')]
fullerrors2
169 [5739, 5908, 6078, 6254, 6278, 6448, 6625, 6626, 9497, 9667, 9836, 10011, 10029, 10184, 10343, 10344, 18366, 18532, 18694, 18695, 18869, 19041, 19213, 19214, 19379, 19568, 19741, 19896, 20078, 20263, 20436, 20597, 20744, 20904, 21045, 21214, 21215, 21373, 21547, 21722, 21893, 21894, 22071, 22245, 22418, 22580, 22581, 22749, 22918, 23078, 23079, 23251, 23433, 23591, 23592, 23765, 23925, 23926, 24102, 24276, 24334, 24504, 24505, 24678, 24800, 24957, 25014, 25174, 25175, 25345, 25493, 25494, 25505, 25669, 25843, 25844, 26006, 26007, 26169, 26334, 26335, 50046, 50277, 50278, 50615, 50755, 50877, 56363, 5580, 5737, 5900, 6062, 6231, 6278, 6451, 6615, 9476, 9477, 9645, 9799, 9959, 10029, 10202, 13367, 18223, 18397, 18398, 18570, 18738, 18928, 19090, 19091, 19251, 19425, 19609, 19610, 19783, 19938, 20109, 20292, 20367, 20537, 20538, 20719, 20898, 21059, 21236, 21395, 21564, 21727, 21889, 21890, 22002, 22169, 22335, 22501, 22660, 22818, 22992, 23154, 23310, 23453, 23618, 23777, 23930, 24085, 24247, 24334, 24498, 24670, 24800, 24962, 25014, 25185, 25340, 25480, 25505, 25665, 25819, 25998, 26148, 26318, 50046, 50277, 50278, 50615, 50755, 50877, 56363]
| Webpage | Name | Status | Type | Rating | Raters | Members | Started | Ended | Demographic | Serialization | Chapters | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20367 | https://myanimelist.net/manga/121902/Hiiragi-s... | nan | nan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 20538 | https://myanimelist.net/manga/123086/Egao_no_T... | nan | nan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 22002 | https://myanimelist.net/manga/132258/Botsuichi | nan | nan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Reload the checkpoint CSV and recompute the fully-failed rows from it.
dffilled2 = pd.read_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv')
dffilled2['Name'] = dffilled2['Name'].astype(str)
dffilled2['Status'] = dffilled2['Status'].astype(str)
# BUG FIX: the Name half of the mask was built from the old `dffilled` frame,
# silently mixing two DataFrames; both conditions must come from dffilled2.
fullerrors3 = dffilled2[(dffilled2['Name'] == 'nan') & (dffilled2['Status'] == 'nan')]
# Final pass over the three remaining broken rows.
lasterrorslist = [20367, 20538, 22002]
counter2 = 0
for ind in lasterrorslist:
    print(ind)
    # BUG FIX: this loop iterates `ind` but the body used `i`, a stale variable
    # from an earlier loop — it re-checked and re-scraped the wrong row every time.
    if dffilled.at[ind, 'Name'] == 'nan':
        print(ind)
        try:
            infofiller(ind)
        except (Exception) as e:
            print(e)
            #errorlist4.append(ind)
            time.sleep(180)
        if counter2 % 10 == 0:
            time.sleep(randint(10, 35))
        if counter2 % 1000 == 0:
            print('we are at the' + str(ind) + 'th value')
        counter2 += 1  # BUG FIX: was `counter2 =+ 1` (assignment, not increment)
        if counter2 % 20 == 0:
            # BUG FIX: randint() with no arguments raises TypeError; use the
            # same pause range as the other loops.
            time.sleep(randint(30, 70))
            print('+ 20 ezclap')
20367 20538 22002
# Manually refill the three remaining failed rows, inspect them, then save
# the final table to a new checkpoint file.
infofiller(20367)
infofiller(20538)
infofiller(22002)
# Display the refilled rows for a visual sanity check.
dffilled.loc[[20367, 20538, 22002]]
dffilled.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp3.csv', index = False)