import pandas as pd
import numpy as np
import sqlite3
from pprint import pprint
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import time
from random import randint
from time import sleep
import warnings
# NOTE(review): blanket warning suppression hides pandas/bs4 deprecation
# warnings too — consider scoping to specific categories.
warnings.filterwarnings('ignore')
# Fetch the first top-manga ranking page once to verify the request/parse
# pipeline before looping over all pages.
Topmanga = requests.get("https://myanimelist.net/topmanga.php?limit=0")
Topmanga_tree = BeautifulSoup(Topmanga.content, "html.parser") #We have to do this for each page
# MAL paginates the ranking 50 titles per page via the ?limit= offset, so
# build one URL per page for offsets 0, 50, ..., 56400 (1129 pages).
# A plain range() suffices for an integer sequence — no need for np.arange,
# and an f-string replaces the concat with a dangling `+ ""`.
Pagenbr = list(range(0, 56450, 50))
Pagelist = [f"https://myanimelist.net/topmanga.php?limit={offset}" for offset in Pagenbr]
# Master list of manga page links, filled by link_retriever below.
# FIX: this initialization was commented out, so the first append raised
# NameError on 'Linklist'.
Linklist = []

def link_retriever(page):
    """Fetch one top-manga listing page and append every new manga link.

    page: URL string of a ranking page. Side effect: appends each
    previously-unseen href containing 'manga' to the global ``Linklist``.
    """
    response = requests.get(page)
    tree = BeautifulSoup(response.content, "html.parser")
    # Titles on the ranking page are <h3> headings wrapping an <a> link.
    for heading in tree.find_all('h3'):
        for anchor in heading.find_all('a'):
            href = anchor.get('href')
            # FIX: guard against anchors with no href (get() returns None,
            # and `'manga' in None` raises TypeError).
            if href and 'manga' in href and href not in Linklist:
                Linklist.append(href)
# Walk every ranking page and harvest its manga links, printing progress.
# FIX: added a small randomized delay between requests — every later loop
# in this file paces itself with randint sleeps, but this 1129-request
# loop hammered the site with none.
for url in Pagelist:
    print(url)
    print(len(Linklist))
    link_retriever(url)
    sleep(randint(1, 3))  # polite per-page delay
pprint(len(Linklist))
56407
def anydup(thelist):
    """Return True when *thelist* contains at least one repeated element."""
    seen = set()
    # `seen.add` returns None (falsy), so the `or` arm merely records the
    # element; `any` short-circuits on the first repeat, exactly like an
    # early-return loop.
    return any(item in seen or bool(seen.add(item)) for item in thelist)
# One row per scraped link; the metadata columns start out empty and are
# filled in later by infofiller.
df = pd.DataFrame(Linklist, columns=['Webpage'])
for col in ['Name', 'Status', 'Type', 'Rating', 'Raters', 'Members',
            'Started', 'Ended', 'Demographic', 'Serialization', 'Chapters']:
    df[col] = ''
df
Webpage | Name | Status | Type | Rating | Raters | Members | Started | Ended | Demographic | Serialization | Chapters | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | https://myanimelist.net/manga/2/Berserk | |||||||||||
1 | https://myanimelist.net/manga/1706/JoJo_no_Kim... | |||||||||||
2 | https://myanimelist.net/manga/13/One_Piece | |||||||||||
3 | https://myanimelist.net/manga/656/Vagabond | |||||||||||
4 | https://myanimelist.net/manga/1/Monster | |||||||||||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
56402 | https://myanimelist.net/manga/88606/Triple_Ojo... | |||||||||||
56403 | https://myanimelist.net/manga/88607/Ore_ga♀de_... | |||||||||||
56404 | https://myanimelist.net/manga/88608/Ojousama_t... | |||||||||||
56405 | https://myanimelist.net/manga/88609/Neratta_On... | |||||||||||
56406 | https://myanimelist.net/manga/88610/Mazomai__M... |
56407 rows × 12 columns
# Persist the link table, then probe one known page (Berserk) to inspect
# the raw info-box text before writing the general scraper.
df.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPages.csv', index=False)

Mangainfo = requests.get('https://myanimelist.net/manga/2/Berserk')
Mangainfo_tree = BeautifulSoup(Mangainfo.content, "html.parser")

# One div.spaceit_pad per info-box line; strip the "Key:" prefix when present
# (split on the first ':' only, keeping the value side).
infolist = [entry.get_text().strip()
            for entry in Mangainfo_tree.find_all('div', {"class": "spaceit_pad"})]
infolistclean = [line.split(":", 1)[1].strip() if ":" in line else line
                 for line in infolist]
#pprint(infolistclean)
['Berserk', 'Berserk: The Prototype', 'ベルセルク', 'Manga', '41', '380', 'On Hiatus', 'Aug 25, 1989 to Sep 10, 2021', 'Action\n' '\n' ' Action, Adventure\n' '\n' ' Adventure, Drama\n' '\n' ' Drama, Fantasy\n' '\n' ' Fantasy, Horror\n' '\n' ' Horror, Supernatural\n' '\n' ' Supernatural', 'Demons\n' '\n' ' Demons, Military\n' '\n' ' Military, Psychological\n' '\n' ' Psychological', 'Seinen\n\n Seinen', 'Young Animal', 'Miura, Kentarou (Story & Art)', '9.411 (scored by 214699214,699 users)1 indicates a weighted score.', "#12 2 based on the top manga page. Please note that 'R18+' titles are " 'excluded.', '#2', '452,586', '84,928']
# Reload the link table and coerce the free-text columns to str so the
# 'nan' string comparisons used by the retry loops behave predictably.
dffilled = pd.read_csv('MangaPages.csv')
columnstr = ['Name', 'Status', 'Type', 'Started', 'Demographic', 'Serialization']
dffilled[columnstr] = dffilled[columnstr].astype(str)
def infofiller(row: int) -> None:
    """Scrape one manga page and write its metadata into the global ``dffilled``.

    ``row`` is the row label whose 'Webpage' URL is fetched; each
    "<Key>: <value>" line of the MAL info box is parsed into ``dictclean``
    and written back into the frame. Any unexpected page layout raises
    (KeyError/ValueError/IndexError) — the calling retry loops catch and
    record the failure.
    """
    Mangainfo = requests.get(dffilled.at[row, 'Webpage'])
    Mangainfo_tree = BeautifulSoup(Mangainfo.content, "html.parser")
    infolist = []
    dictclean = {}
    # Every info-box line on a MAL manga page lives in a div.spaceit_pad.
    for info in Mangainfo_tree.find_all('div', {"class": "spaceit_pad"}):
        infolist.append(info.get_text().strip())
    # Split each "Key: value" line into a lookup dict (first ':' only).
    for i in infolist:
        splitter = ":"
        if splitter in i:
            dictclean[(str(i.split(splitter, 1)[0]))] = (i.split(splitter, 1)[1]).strip()
    # Prefer the English title, then synonyms, then the Japanese title.
    if 'English' in dictclean.keys():
        dffilled.at[row, 'Name'] = dictclean['English']
    elif 'Synonyms' in dictclean.keys():
        dffilled.at[row, 'Name'] = dictclean['Synonyms']
    elif 'Japanese' in dictclean.keys():
        dffilled.at[row, 'Name'] = dictclean['Japanese']
    else:
        dffilled.at[row, 'Name'] = np.nan
    if 'Status' in dictclean.keys():
        dffilled.at[row, 'Status'] = dictclean['Status']
    else:
        dffilled.at[row, 'Status'] = np.nan
    # 'Type' is assumed always present; a missing key raises KeyError,
    # which the retry loops catch.
    dffilled.at[row, 'Type'] = dictclean['Type']
    if 'N/A' in dictclean['Score']:
        dffilled.at[row, 'Rating'] = np.nan
    else:
        # Score text looks like "9.411 (scored by 214699214,699 users)1 ..."
        # where the trailing '1' is a footnote marker fused to the number.
        # NOTE(review): splitting on the FIRST '1' would also truncate a real
        # '1' digit inside scores such as 9.41 — confirm against live markup.
        dffilled.at[row, 'Rating'] = float(dictclean['Score'].split(' ', 1)[0].split('1', 1)[0])
        # The rater count appears duplicated ("214699214,699"); strip commas
        # and keep only the first half of the digits.
        raters = dictclean['Score'].split(' ', 4)[3].replace(",","")
        raters = int(raters[:len(raters)//2])
        dffilled.at[row, 'Raters'] = raters
    # Members is present even when the score is N/A, so it sits outside
    # the else-branch above.
    members = int(dictclean['Members'].replace(",",""))
    dffilled.at[row, 'Members'] = members
    # Published dates come as "Aug 25, 1989 to Sep 10, 2021", "Aug 1989",
    # or just "1989"; missing dates are 'Not available' or 'N/A'.
    if dictclean['Published'] == 'Not available':
        dffilled.at[row, 'Started'] = np.nan
        dffilled.at[row, 'Ended'] = np.nan
    elif dictclean['Published'] == 'N/A':
        dffilled.at[row, 'Started'] = np.nan
        dffilled.at[row, 'Ended'] = np.nan
    else:
        # NOTE(review): replace(" ", "", 1) deletes the first remaining space
        # — this only yields strptime-parseable text if the scraped value
        # carries an extra internal space; verify against the live markup.
        startdate = dictclean['Published'].split('to', 1)[0].replace(",","").strip().replace(" ","",1)
        # Pick the strptime format from the fragment length:
        # "1989" (<5), "Aug 1989" (<9), otherwise "Aug 25 1989".
        if len(startdate) < 5:
            startdate2 = datetime.strptime(startdate, '%Y').date()
            dffilled.at[row, 'Started'] = startdate2
        elif len(startdate) < 9:
            startdate2 = datetime.strptime(startdate, '%b %Y').date()
            dffilled.at[row, 'Started'] = startdate2
        else:
            startdate2 = datetime.strptime(startdate, '%b %d %Y').date()
            dffilled.at[row, 'Started'] = startdate2
        # An end date only exists when the range contains 'to'; the '?'
        # sentinel marks a still-running publication.
        if 'to' in dictclean['Published']:
            enddate = dictclean['Published'].split('to', 1)[1].replace(",","").strip().replace(" ","",1)
        else:
            enddate = '?'
        if enddate != '?':
            if len(enddate) < 5:
                enddate2 = datetime.strptime(enddate, '%Y').date()
                dffilled.at[row, 'Ended'] = enddate2
            elif len(enddate) < 9:
                enddate2 = datetime.strptime(enddate, '%b %Y').date()
                dffilled.at[row, 'Ended'] = enddate2
            else:
                enddate2 = datetime.strptime(enddate, '%b %d %Y').date()
                dffilled.at[row, 'Ended'] = enddate2
        else:
            dffilled.at[row, 'Ended'] = np.nan
    # The key is sometimes 'Demographics' (plural), sometimes 'Demographic';
    # the scraped value repeats itself ("Seinen\n\n  Seinen"), so keep only
    # the first line.
    if 'Demographics' in dictclean.keys():
        Dem = dictclean['Demographics'].split('\n', 1)[0]
        dffilled.at[row, 'Demographic'] = Dem
    elif 'Demographic' in dictclean.keys():
        Dem = dictclean['Demographic'].split('\n', 1)[0]
        dffilled.at[row, 'Demographic'] = Dem
    else:
        dffilled.at[row, 'Demographic'] = np.nan
    dffilled.at[row, 'Serialization'] = dictclean['Serialization'].strip()
    # 'Unknown' chapter counts (still publishing) become NaN.
    if dictclean['Chapters'] != 'Unknown':
        dffilled.at[row, 'Chapters'] = int(dictclean['Chapters'].strip())
    else:
        dffilled.at[row, 'Chapters'] = np.nan
# Resume pass: walk rows 26406-56406 in batches of 40, re-scraping any row
# whose Name is still the string 'nan'. Failures are recorded in errorlist
# and followed by a long cool-down; progress is checkpointed periodically.
#indexlist1 = np.arange(8807, 26407, 40).tolist()
indexlist2 = np.arange(26406, 56407, 40).tolist()
errorlist = []
dffilled = pd.read_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv')
dffilled['Name'] = dffilled['Name'].astype(str)
dffilled['Status'] = dffilled['Status'].astype(str)
counter = 0
for index in indexlist2:
    print(index)
    indexstart = index - 40
    for i, row in dffilled[indexstart:index].iterrows():
        if dffilled.at[i, 'Name'] == 'nan':
            try:
                infofiller(i)
            except Exception as e:
                # Record the failure and back off hard before continuing.
                print('Error at index {}: {!r}'.format(i, row))
                print(e)
                errorlist.append(i)
                time.sleep(180)
            if counter % 10 == 0:
                time.sleep(randint(10,35))
            if counter % 1000 == 0:
                print('we are at the' + str(i) + 'th value')
                dffilled.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp2.csv', index = False)
            # FIX: was `counter =+ 1`, which assigned +1 every time, so the
            # periodic sleeps and checkpoints above never fired again.
            counter += 1
    time.sleep(randint(30,70))
pd.set_option("display.max_rows", 30)
# Spot-check: re-run the filler on one row and display it.
# NOTE(review): infofiller(9431) fills label 9431 but the slice below shows
# positional row 9430 — possible off-by-one in this spot check; confirm.
infofiller(9431)
dffilled[9430:9431]
print(len(errorlist))
print(errorlist)
# Persist the failed indices, one per line, so a later session can retry
# them. FIX: use a context manager so the handle is closed even if a
# write raises (the original open/close pair leaked on error).
with open("a_file.txt", "w") as textfile:
    for element in errorlist:
        textfile.write(str(element) + "\n")
129 [26458, 27606, 27847, 28726, 29246, 29646, 30086, 30446, 31046, 31206, 31406, 31606, 31926, 32206, 33291, 33686, 34606, 35166, 35646, 35886, 37966, 38449, 38614, 38772, 39206, 39366, 39526, 39561, 39766, 39926, 40086, 40236, 40486, 40646, 40808, 41046, 41326, 41566, 41726, 41966, 42126, 42286, 42566, 42751, 43007, 43206, 43368, 43526, 43686, 43846, 44007, 44166, 44366, 44394, 44590, 44598, 44605, 44687, 44846, 45006, 45166, 45326, 45414, 45568, 45726, 45886, 46086, 46246, 46406, 46567, 46726, 46867, 47047, 47215, 47366, 47442, 47606, 47766, 47926, 48086, 48246, 48359, 48686, 48846, 49006, 49166, 49366, 49526, 49686, 49926, 50046, 50206, 50277, 50278, 50446, 50615, 50755, 50877, 51046, 51206, 51406, 51566, 51726, 51926, 52086, 52262, 52486, 52686, 52846, 53006, 53167, 53446, 53606, 53806, 54006, 54166, 54366, 54526, 54692, 54846, 54853, 55046, 55206, 55366, 55526, 55726, 56047, 56206, 56363]
# Retry pass over the indices that failed during the batch scrape; indices
# that fail again are collected in errorlist2.
print(len(errorlist))
errorlist2 = []
counter2 = 0
for i in errorlist:
    print(i)
    if dffilled.at[i, 'Name'] == 'nan':
        try:
            infofiller(i)
        except Exception as e:
            # FIX: the original formatted `row`, a stale variable left over
            # from the previous loop — report this row instead.
            print('Error at index {}: {!r}'.format(i, dffilled.loc[i]))
            print(e)
            errorlist2.append(i)
            time.sleep(180)
        # FIX: the modulo checks used `counter` (stale from the previous
        # cell) instead of counter2, so pacing/checkpointing never fired.
        if counter2 % 10 == 0:
            time.sleep(randint(10,35))
        if counter2 % 1000 == 0:
            # FIX: concatenating the int `i` raised TypeError — wrap in str().
            print('we are at the' + str(i) + 'th value')
            dffilled.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv', index = False)
        # FIX: was `counter2 =+ 1` (assignment of +1, not an increment).
        counter2 += 1
        if counter2 % 20 == 0:
            # FIX: randint() with no arguments raises TypeError; use the same
            # 30-70s batch cool-down as the main scrape loop.
            time.sleep(randint(30, 70))
            print('+ 20 ezclap')
dffilled.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv', index = False)
#dffilled = pd.read_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv')
# Rows whose Name failed to fill but whose Status did: partially scraped
# pages (title lookup fell through every key).
for text_col in ('Name', 'Status'):
    dffilled[text_col] = dffilled[text_col].astype(str)
name_missing = dffilled['Name'] == 'nan'
status_present = dffilled['Status'] != 'nan'
nameerrors = dffilled[name_missing & status_present]
nameerrors
Webpage | Name | Status | Type | Rating | Raters | Members | Started | Ended | Demographic | Serialization | Chapters | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
3052 | https://myanimelist.net/manga/7071/B-Eyes | nan | Finished | Manga | 7.43 | 425.0 | 865.0 | 2000-01-01 | NaN | Shoujo | None | 17.0 |
3170 | https://myanimelist.net/manga/13070/Wait_Wolf | nan | Finished | Manhwa | 7.42 | 460.0 | 1412.0 | NaN | NaN | NaN | None | NaN |
3488 | https://myanimelist.net/manga/4795/Rurouni_Ken... | nan | Finished | Light Novel | 7.38 | 437.0 | 1399.0 | 1996-10-01 | NaN | Shounen | None | 19.0 |
3580 | https://myanimelist.net/manga/3070/Shokugyou_T... | nan | Finished | Manga | 7.37 | 421.0 | 804.0 | 2003-01-01 | NaN | NaN | None | 4.0 |
3721 | https://myanimelist.net/manga/4657/Winter_Flowers | nan | Finished | One-shot | 7.35 | 1500.0 | 2520.0 | 2007-01-01 | NaN | Shoujo | Cheese! | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
48786 | https://myanimelist.net/manga/43335/Brother_X_... | nan | Finished | One-shot | NaN | NaN | 45.0 | NaN | NaN | NaN | None | 1.0 |
48876 | https://myanimelist.net/manga/45193/Hideout_ZIN | nan | Finished | One-shot | NaN | NaN | 192.0 | NaN | NaN | NaN | None | 1.0 |
48878 | https://myanimelist.net/manga/45227/Sweet_Drug | nan | Finished | One-shot | NaN | NaN | 130.0 | NaN | NaN | NaN | None | 1.0 |
48909 | https://myanimelist.net/manga/45615/Takatoh-sa... | nan | Finished | Manga | NaN | NaN | 83.0 | NaN | NaN | NaN | Magazine Be x Boy | 8.0 |
52587 | https://myanimelist.net/manga/59381/Switch_dj_... | nan | Finished | Doujinshi | NaN | NaN | 273.0 | 2009-08-16 | NaN | NaN | None | 1.0 |
604 rows × 12 columns
# NOTE(review): errorlist3 is not defined anywhere in this file — presumably
# it was produced by a scraping cell that is not part of this dump; these
# prints will raise NameError if run top-to-bottom. Confirm the missing cell.
print(len(errorlist3))
print(errorlist3)
# Rows where neither Name nor Status was filled: pages that failed entirely.
fullerrors2 = dffilled[(dffilled['Name'] == 'nan') & (dffilled['Status'] == 'nan')]
fullerrors2
169 [5739, 5908, 6078, 6254, 6278, 6448, 6625, 6626, 9497, 9667, 9836, 10011, 10029, 10184, 10343, 10344, 18366, 18532, 18694, 18695, 18869, 19041, 19213, 19214, 19379, 19568, 19741, 19896, 20078, 20263, 20436, 20597, 20744, 20904, 21045, 21214, 21215, 21373, 21547, 21722, 21893, 21894, 22071, 22245, 22418, 22580, 22581, 22749, 22918, 23078, 23079, 23251, 23433, 23591, 23592, 23765, 23925, 23926, 24102, 24276, 24334, 24504, 24505, 24678, 24800, 24957, 25014, 25174, 25175, 25345, 25493, 25494, 25505, 25669, 25843, 25844, 26006, 26007, 26169, 26334, 26335, 50046, 50277, 50278, 50615, 50755, 50877, 56363, 5580, 5737, 5900, 6062, 6231, 6278, 6451, 6615, 9476, 9477, 9645, 9799, 9959, 10029, 10202, 13367, 18223, 18397, 18398, 18570, 18738, 18928, 19090, 19091, 19251, 19425, 19609, 19610, 19783, 19938, 20109, 20292, 20367, 20537, 20538, 20719, 20898, 21059, 21236, 21395, 21564, 21727, 21889, 21890, 22002, 22169, 22335, 22501, 22660, 22818, 22992, 23154, 23310, 23453, 23618, 23777, 23930, 24085, 24247, 24334, 24498, 24670, 24800, 24962, 25014, 25185, 25340, 25480, 25505, 25665, 25819, 25998, 26148, 26318, 50046, 50277, 50278, 50615, 50755, 50877, 56363]
Webpage | Name | Status | Type | Rating | Raters | Members | Started | Ended | Demographic | Serialization | Chapters | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
20367 | https://myanimelist.net/manga/121902/Hiiragi-s... | nan | nan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
20538 | https://myanimelist.net/manga/123086/Egao_no_T... | nan | nan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
22002 | https://myanimelist.net/manga/132258/Botsuichi | nan | nan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Cross-check the checkpoint file on disk for rows that are still entirely
# unscraped, and list the last few stubborn indices for a manual pass.
dffilled2 = pd.read_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp.csv')
dffilled2['Name'] = dffilled2['Name'].astype(str)
dffilled2['Status'] = dffilled2['Status'].astype(str)
# FIX: the Name condition previously filtered on the in-memory `dffilled`
# while the Status condition used `dffilled2` — mixing masks from two
# different frames; both conditions must come from dffilled2.
fullerrors3 = dffilled2[(dffilled2['Name'] == 'nan') & (dffilled2['Status'] == 'nan')]
lasterrorslist = [20367, 20538, 22002]
# Final retry pass over the three remaining stubborn rows.
counter2 = 0
for ind in lasterrorslist:
    print(ind)
    # FIX: the original tested and scraped with the stale loop variable `i`
    # from an earlier cell instead of `ind`, so it re-processed the wrong
    # row three times.
    if dffilled.at[ind, 'Name'] == 'nan':
        print(ind)
        try:
            infofiller(ind)
        except Exception as e:
            print(e)
            time.sleep(180)
        if counter2 % 10 == 0:
            time.sleep(randint(10, 35))
        if counter2 % 1000 == 0:
            print('we are at the' + str(ind) + 'th value')
        # FIX: was `counter2 =+ 1` (assignment of +1, not an increment).
        counter2 += 1
        if counter2 % 20 == 0:
            # FIX: randint() with no arguments raises TypeError; use the
            # same 30-70s cool-down as the other loops.
            time.sleep(randint(30, 70))
            print('+ 20 ezclap')
20367 20538 22002
# Manually fill the three rows the retry loops never reached, then display
# them to confirm the scrape succeeded.
infofiller(20367)
infofiller(20538)
infofiller(22002)
dffilled.loc[[20367, 20538, 22002]]
Webpage | Name | Status | Type | Rating | Raters | Members | Started | Ended | Demographic | Serialization | Chapters | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
20367 | https://myanimelist.net/manga/121902/Hiiragi-s... | Everyday in a Vampire Family | Finished | One-shot | 6.68 | 145.0 | 387.0 | 2019-09-11 | NaN | Shounen | Shounen Magazine (Weekly) | 1.0 |
20538 | https://myanimelist.net/manga/123086/Egao_no_T... | A Workplace Where You Can't Help But Smile | Publishing | Manga | NaN | NaN | 307.0 | 2019-05-18 | NaN | NaN | Comic Days | NaN |
22002 | https://myanimelist.net/manga/132258/Botsuichi | 没イチ | Publishing | Manga | NaN | NaN | 16.0 | 2020-12-08 | NaN | Seinen | Evening | NaN |
dffilled.to_csv(r'C:/Users/sarac/Desktop/Projects/Mangas/MangaPagesTemp3.csv', index = False)