rewrite fanza.py
This commit is contained in:
parent
690557f878
commit
5f46f3f25d
4
core.py
4
core.py
@ -423,7 +423,7 @@ def PrintFiles(option, path, c_word, naming_rule, part, cn_sub, json_data, filep
|
|||||||
print(" <num>" + number + "</num>", file=code)
|
print(" <num>" + number + "</num>", file=code)
|
||||||
print(" <premiered>" + release + "</premiered>", file=code)
|
print(" <premiered>" + release + "</premiered>", file=code)
|
||||||
print(" <cover>" + cover + "</cover>", file=code)
|
print(" <cover>" + cover + "</cover>", file=code)
|
||||||
print(" <website>" + "https://www.javbus.com/" + number + "</website>", file=code)
|
print(" <website>" + website + "</website>", file=code)
|
||||||
print("</movie>", file=code)
|
print("</movie>", file=code)
|
||||||
print("[+]Writeed! " + path + "/" + number + c_word + ".nfo")
|
print("[+]Writeed! " + path + "/" + number + c_word + ".nfo")
|
||||||
elif option == 'kodi':
|
elif option == 'kodi':
|
||||||
@ -470,7 +470,7 @@ def PrintFiles(option, path, c_word, naming_rule, part, cn_sub, json_data, filep
|
|||||||
print(" <num>" + number + "</num>", file=code)
|
print(" <num>" + number + "</num>", file=code)
|
||||||
print(" <release>" + release + "</release>", file=code)
|
print(" <release>" + release + "</release>", file=code)
|
||||||
print(" <cover>" + cover + "</cover>", file=code)
|
print(" <cover>" + cover + "</cover>", file=code)
|
||||||
print(" <website>" + "https://www.javbus.com/" + number + "</website>", file=code)
|
print(" <website>" + website + "</website>", file=code)
|
||||||
print("</movie>", file=code)
|
print("</movie>", file=code)
|
||||||
print("[+]Writeed! " + path + "/" + number + c_word + ".nfo")
|
print("[+]Writeed! " + path + "/" + number + c_word + ".nfo")
|
||||||
except IOError as e:
|
except IOError as e:
|
||||||
|
273
fanza.py
273
fanza.py
@ -1,121 +1,226 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import re
|
|
||||||
from lxml import etree
|
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
from ADC_function import *
|
from ADC_function import *
|
||||||
|
|
||||||
# import sys
|
# import sys
|
||||||
# import io
|
# import io
|
||||||
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
|
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
|
||||||
|
|
||||||
def getTitle(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser())
|
def getTitle(text):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser())
|
||||||
result = html.xpath('//*[@id="title"]/text()')[0]
|
result = html.xpath('//*[@id="title"]/text()')[0]
|
||||||
return result
|
return result
|
||||||
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser())
|
|
||||||
result = str(html.xpath("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()")).strip(" ['']").replace("', '",',')
|
def getActor(text):
|
||||||
|
# //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser())
|
||||||
|
result = (
|
||||||
|
str(
|
||||||
|
html.xpath(
|
||||||
|
"//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.strip(" ['']")
|
||||||
|
.replace("', '", ",")
|
||||||
|
)
|
||||||
return result
|
return result
|
||||||
def getStudio(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
|
def getStudio(text):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
try:
|
try:
|
||||||
result1 = html.xpath("//td[contains(text(),'メーカー')]/following-sibling::td/a/text()")[0]
|
result = html.xpath(
|
||||||
|
"//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
|
||||||
|
)[0]
|
||||||
except:
|
except:
|
||||||
result1 = html.xpath("//td[contains(text(),'メーカー')]/following-sibling::td/text()")[0]
|
result = html.xpath(
|
||||||
return result1
|
"//td[contains(text(),'メーカー')]/following-sibling::td/text()"
|
||||||
def getRuntime(a):
|
)[0]
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
return result
|
||||||
result1 = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
|
|
||||||
return re.search('\d+', str(result1)).group()
|
|
||||||
def getLabel(a):
|
def getRuntime(text):
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
|
result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
|
||||||
|
return re.search(r"\d+", str(result)).group()
|
||||||
|
|
||||||
|
|
||||||
|
def getLabel(text):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
try:
|
try:
|
||||||
result1 = html.xpath("//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()")[0]
|
result = html.xpath(
|
||||||
|
"//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
|
||||||
|
)[0]
|
||||||
except:
|
except:
|
||||||
result1 = html.xpath("//td[contains(text(),'シリーズ:')]/following-sibling::td/text()")[0]
|
result = html.xpath(
|
||||||
return result1
|
"//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
|
||||||
def getNum(a):
|
)[0]
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def getNum(text):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
try:
|
try:
|
||||||
result1 = html.xpath("//td[contains(text(),'品番:')]/following-sibling::td/a/text()")[0]
|
result = html.xpath(
|
||||||
|
"//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
|
||||||
|
)[0]
|
||||||
except:
|
except:
|
||||||
result1 = html.xpath("//td[contains(text(),'品番:')]/following-sibling::td/text()")[0]
|
result = html.xpath(
|
||||||
return result1
|
"//td[contains(text(),'品番:')]/following-sibling::td/text()"
|
||||||
|
)[0]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def getYear(getRelease):
|
def getYear(getRelease):
|
||||||
try:
|
try:
|
||||||
result = str(re.search('\d{4}',getRelease).group())
|
result = str(re.search(r"\d{4}", getRelease).group())
|
||||||
return result
|
return result
|
||||||
except:
|
except:
|
||||||
return getRelease
|
return getRelease
|
||||||
def getRelease(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
|
def getRelease(text):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
try:
|
try:
|
||||||
result1 = html.xpath("//td[contains(text(),'発売日:')]/following-sibling::td/a/text()")[0].lstrip('\n')
|
result = html.xpath(
|
||||||
|
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
|
||||||
|
)[0].lstrip("\n")
|
||||||
except:
|
except:
|
||||||
result1 = html.xpath("//td[contains(text(),'発売日:')]/following-sibling::td/text()")[0].lstrip('\n')
|
result = html.xpath(
|
||||||
return result1
|
"//td[contains(text(),'発売日:')]/following-sibling::td/text()"
|
||||||
def getTag(a):
|
)[0].lstrip("\n")
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
try:
|
|
||||||
result1 = html.xpath("//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()")
|
|
||||||
except:
|
|
||||||
result1 = html.xpath("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()")
|
|
||||||
return result1
|
|
||||||
def getCover(htmlcode,number):
|
|
||||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
|
||||||
result = html.xpath('//*[@id="'+number+'"]/@href')[0]
|
|
||||||
return result
|
return result
|
||||||
def getDirector(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
|
def getTag(text):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
try:
|
try:
|
||||||
result1 = html.xpath("//td[contains(text(),'監督:')]/following-sibling::td/a/text()")[0]
|
result = html.xpath(
|
||||||
|
"//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
|
||||||
|
)
|
||||||
except:
|
except:
|
||||||
result1 = html.xpath("//td[contains(text(),'監督:')]/following-sibling::td/text()")[0]
|
result = html.xpath(
|
||||||
return result1
|
"//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
|
||||||
def getOutline(htmlcode):
|
)
|
||||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
|
||||||
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace('\n','')
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def getCover(text, number):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser())
|
||||||
|
cover_number = number
|
||||||
|
if "_" in cover_number:
|
||||||
|
# fanza writes "_" as the literal text \u005f in the cover image element id
|
||||||
|
cover_number = cover_number.replace("_", r"\u005f")
|
||||||
|
try:
|
||||||
|
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
|
||||||
|
except:
|
||||||
|
# (TODO) handle more edge case
|
||||||
|
# print(html)
|
||||||
|
# raise exception here, same behavior as before
|
||||||
|
# the cover picture is what users mainly need, so do not continue without it
|
||||||
|
raise ValueError("can not find image")
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def getDirector(text):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
|
try:
|
||||||
|
result = html.xpath(
|
||||||
|
"//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
|
||||||
|
)[0]
|
||||||
|
except:
|
||||||
|
result = html.xpath(
|
||||||
|
"//td[contains(text(),'監督:')]/following-sibling::td/text()"
|
||||||
|
)[0]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def getOutline(text):
|
||||||
|
html = etree.fromstring(text, etree.HTMLParser())
|
||||||
|
try:
|
||||||
|
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
|
||||||
|
"\n", ""
|
||||||
|
)
|
||||||
|
if result == "":
|
||||||
|
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
|
||||||
|
"\n", ""
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
# (TODO) handle more edge case
|
||||||
|
# print(html)
|
||||||
|
return ""
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def main(number):
|
def main(number):
|
||||||
htmlcode=get_html('https://www.dmm.co.jp/digital/videoa/-/detail/=/cid='+number)
|
# fanza ids contain only letters, digits and underscores, so normalize the input here
|
||||||
url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid='+number
|
# @note: the only underscore usage found so far is the h_ prefix, e.g. h_test123456789
|
||||||
if '404 Not Found' in htmlcode:
|
fanza_search_number = number
|
||||||
htmlcode=get_html('https://www.dmm.co.jp/mono/dvd/-/detail/=/cid='+number)
|
# AV_Data_Capture.py getNumber() over-formats the input, so restore the h_ prefix from h-
|
||||||
url = 'https://www.dmm.co.jp/mono/dvd/-/detail/=/cid='+number
|
if fanza_search_number.startswith("h-"):
|
||||||
if '404 Not Found' in htmlcode:
|
fanza_search_number = fanza_search_number.replace("h-", "h_")
|
||||||
# neither digital nor mono is available
|
|
||||||
return json.dumps({'title': '',})
|
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
|
||||||
|
|
||||||
|
fanza_urls = [
|
||||||
|
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
|
||||||
|
]
|
||||||
|
chosen_url = ""
|
||||||
|
for url in fanza_urls:
|
||||||
|
chosen_url = url + fanza_search_number
|
||||||
|
htmlcode = get_html(chosen_url)
|
||||||
|
if "404 Not Found" not in htmlcode:
|
||||||
|
break
|
||||||
|
if "404 Not Found" in htmlcode:
|
||||||
|
return json.dumps({"title": "",})
|
||||||
try:
|
try:
|
||||||
# for some old page, the input number does not match the page
|
# for some old page, the input number does not match the page
|
||||||
# for example, the url will be cid=test012
|
# for example, the url will be cid=test012
|
||||||
# but the hinban on the page is test00012
|
# but the hinban on the page is test00012
|
||||||
# so get the hinban first, and then pass it to following functions
|
# so get the hinban first, and then pass it to following functions
|
||||||
number = getNum(htmlcode)
|
fanza_hinban = getNum(htmlcode)
|
||||||
dic = {
|
data = {
|
||||||
'title': getTitle(htmlcode).strip(getActor(htmlcode)),
|
"title": getTitle(htmlcode).strip(getActor(htmlcode)),
|
||||||
'studio': getStudio(htmlcode),
|
"studio": getStudio(htmlcode),
|
||||||
'outline': getOutline(htmlcode),
|
"outline": getOutline(htmlcode),
|
||||||
'runtime': getRuntime(htmlcode),
|
"runtime": getRuntime(htmlcode),
|
||||||
'director': getDirector(htmlcode),
|
"director": getDirector(htmlcode) if "anime" not in chosen_url else "",
|
||||||
'actor': getActor(htmlcode),
|
"actor": getActor(htmlcode) if "anime" not in chosen_url else "",
|
||||||
'release': getRelease(htmlcode),
|
"release": getRelease(htmlcode),
|
||||||
'number': number,
|
"number": fanza_hinban,
|
||||||
'cover': getCover(htmlcode, number),
|
"cover": getCover(htmlcode, fanza_hinban),
|
||||||
'imagecut': 1,
|
"imagecut": 1,
|
||||||
'tag': getTag(htmlcode),
|
"tag": getTag(htmlcode),
|
||||||
'label':getLabel(htmlcode),
|
"label": getLabel(htmlcode),
|
||||||
'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
|
"year": getYear(
|
||||||
'actor_photo': '',
|
getRelease(htmlcode)
|
||||||
'website': url,
|
), # str(re.search('\d{4}',getRelease(a)).group()),
|
||||||
'source': 'fanza.py',
|
"actor_photo": "",
|
||||||
|
"website": chosen_url,
|
||||||
|
"source": "fanza.py",
|
||||||
}
|
}
|
||||||
except :
|
except:
|
||||||
dic = {
|
data = {
|
||||||
'title': '',
|
"title": "",
|
||||||
}
|
}
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) # .encode('UTF-8')
|
js = json.dumps(
|
||||||
|
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
|
||||||
|
) # .encode('UTF-8')
|
||||||
return js
|
return js
|
||||||
|
|
||||||
# main('DV-1562')
|
|
||||||
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
|
if __name__ == "__main__":
|
||||||
#print(main('ipx292'))
|
# print(main("DV-1562"))
|
||||||
|
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
|
||||||
|
# print(main("ipx292"))
|
||||||
|
pass
|
||||||
|
Loading…
Reference in New Issue
Block a user