diff --git a/core.py b/core.py
index 5ba1432..2b241ba 100755
--- a/core.py
+++ b/core.py
@@ -423,7 +423,7 @@ def PrintFiles(option, path, c_word, naming_rule, part, cn_sub, json_data, filep
print(" " + number + "", file=code)
print(" " + release + "", file=code)
print(" " + cover + "", file=code)
- print(" " + "https://www.javbus.com/" + number + "", file=code)
+ print(" " + website + "", file=code)
print("", file=code)
print("[+]Writeed! " + path + "/" + number + c_word + ".nfo")
elif option == 'kodi':
@@ -470,7 +470,7 @@ def PrintFiles(option, path, c_word, naming_rule, part, cn_sub, json_data, filep
print(" " + number + "", file=code)
print(" " + release + "", file=code)
print(" " + cover + "", file=code)
- print(" " + "https://www.javbus.com/" + number + "", file=code)
+ print(" " + website + "", file=code)
print("", file=code)
print("[+]Writeed! " + path + "/" + number + c_word + ".nfo")
except IOError as e:
diff --git a/fanza.py b/fanza.py
index 7edf6d9..78df8fb 100644
--- a/fanza.py
+++ b/fanza.py
@@ -1,121 +1,226 @@
-#!/usr/bin/python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
-import re
-from lxml import etree
import json
+import re
+
+from lxml import etree
+
from ADC_function import *
+
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-def getTitle(a):
- html = etree.fromstring(a, etree.HTMLParser())
+
+def getTitle(text):
+ html = etree.fromstring(text, etree.HTMLParser())
result = html.xpath('//*[@id="title"]/text()')[0]
return result
-def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
- html = etree.fromstring(a, etree.HTMLParser())
- result = str(html.xpath("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()")).strip(" ['']").replace("', '",',')
+
+
+def getActor(text):
+ # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
+ html = etree.fromstring(text, etree.HTMLParser())
+ result = (
+ str(
+ html.xpath(
+ "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
+ )
+ )
+ .strip(" ['']")
+ .replace("', '", ",")
+ )
return result
-def getStudio(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+
+
+def getStudio(text):
+ html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
- result1 = html.xpath("//td[contains(text(),'メーカー')]/following-sibling::td/a/text()")[0]
+ result = html.xpath(
+ "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
+ )[0]
except:
- result1 = html.xpath("//td[contains(text(),'メーカー')]/following-sibling::td/text()")[0]
- return result1
-def getRuntime(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
- return re.search('\d+', str(result1)).group()
-def getLabel(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+ result = html.xpath(
+ "//td[contains(text(),'メーカー')]/following-sibling::td/text()"
+ )[0]
+ return result
+
+
+def getRuntime(text):
+ html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+ result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
+ return re.search(r"\d+", str(result)).group()
+
+
+def getLabel(text):
+ html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
- result1 = html.xpath("//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()")[0]
+ result = html.xpath(
+ "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
+ )[0]
except:
- result1 = html.xpath("//td[contains(text(),'シリーズ:')]/following-sibling::td/text()")[0]
- return result1
-def getNum(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+ result = html.xpath(
+ "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
+ )[0]
+ return result
+
+
+def getNum(text):
+ html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
- result1 = html.xpath("//td[contains(text(),'品番:')]/following-sibling::td/a/text()")[0]
+ result = html.xpath(
+ "//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
+ )[0]
except:
- result1 = html.xpath("//td[contains(text(),'品番:')]/following-sibling::td/text()")[0]
- return result1
+ result = html.xpath(
+ "//td[contains(text(),'品番:')]/following-sibling::td/text()"
+ )[0]
+ return result
+
+
def getYear(getRelease):
try:
- result = str(re.search('\d{4}',getRelease).group())
+ result = str(re.search(r"\d{4}", getRelease).group())
return result
except:
return getRelease
-def getRelease(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+
+
+def getRelease(text):
+ html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
- result1 = html.xpath("//td[contains(text(),'発売日:')]/following-sibling::td/a/text()")[0].lstrip('\n')
+ result = html.xpath(
+ "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
+ )[0].lstrip("\n")
except:
- result1 = html.xpath("//td[contains(text(),'発売日:')]/following-sibling::td/text()")[0].lstrip('\n')
- return result1
-def getTag(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- try:
- result1 = html.xpath("//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()")
- except:
- result1 = html.xpath("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()")
- return result1
-def getCover(htmlcode,number):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- result = html.xpath('//*[@id="'+number+'"]/@href')[0]
+ result = html.xpath(
+ "//td[contains(text(),'発売日:')]/following-sibling::td/text()"
+ )[0].lstrip("\n")
return result
-def getDirector(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+
+
+def getTag(text):
+ html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
- result1 = html.xpath("//td[contains(text(),'監督:')]/following-sibling::td/a/text()")[0]
+ result = html.xpath(
+ "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
+ )
except:
- result1 = html.xpath("//td[contains(text(),'監督:')]/following-sibling::td/text()")[0]
- return result1
-def getOutline(htmlcode):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace('\n','')
+ result = html.xpath(
+ "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
+ )
return result
+
+
+def getCover(text, number):
+ html = etree.fromstring(text, etree.HTMLParser())
+ cover_number = number
+ if "_" in cover_number:
+        # fanza replaces "_" with \u005f in the image element id
+ cover_number = cover_number.replace("_", r"\u005f")
+ try:
+ result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
+ except:
+        # (TODO) handle more edge cases
+        # print(html)
+        # raise an exception here — same behavior as before;
+        # the main requirement is fetching the cover picture
+ raise ValueError("can not find image")
+ return result
+
+
+def getDirector(text):
+ html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+ try:
+ result = html.xpath(
+ "//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
+ )[0]
+ except:
+ result = html.xpath(
+ "//td[contains(text(),'監督:')]/following-sibling::td/text()"
+ )[0]
+ return result
+
+
+def getOutline(text):
+ html = etree.fromstring(text, etree.HTMLParser())
+ try:
+ result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
+ "\n", ""
+ )
+ if result == "":
+ result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
+ "\n", ""
+ )
+ except:
+        # (TODO) handle more edge cases
+ # print(html)
+ return ""
+ return result
+
+
def main(number):
- htmlcode=get_html('https://www.dmm.co.jp/digital/videoa/-/detail/=/cid='+number)
- url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid='+number
- if '404 Not Found' in htmlcode:
- htmlcode=get_html('https://www.dmm.co.jp/mono/dvd/-/detail/=/cid='+number)
- url = 'https://www.dmm.co.jp/mono/dvd/-/detail/=/cid='+number
- if '404 Not Found' in htmlcode:
- # neither digital nor mono is available
- return json.dumps({'title': '',})
+    # fanza allows letters, numbers, and underscores; normalize the input here
+    # @note: underscore has only been observed as an "h_" prefix, e.g. h_test123456789
+ fanza_search_number = number
+    # AV_Data_Capture.py.getNumber() over-formats the input, so restore the h_ prefix
+ if fanza_search_number.startswith("h-"):
+ fanza_search_number = fanza_search_number.replace("h-", "h_")
+
+ fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
+
+ fanza_urls = [
+ "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
+ "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
+ "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
+ "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
+ ]
+ chosen_url = ""
+ for url in fanza_urls:
+ chosen_url = url + fanza_search_number
+ htmlcode = get_html(chosen_url)
+ if "404 Not Found" not in htmlcode:
+ break
+ if "404 Not Found" in htmlcode:
+ return json.dumps({"title": "",})
try:
# for some old page, the input number does not match the page
# for example, the url will be cid=test012
# but the hinban on the page is test00012
# so get the hinban first, and then pass it to following functions
- number = getNum(htmlcode)
- dic = {
- 'title': getTitle(htmlcode).strip(getActor(htmlcode)),
- 'studio': getStudio(htmlcode),
- 'outline': getOutline(htmlcode),
- 'runtime': getRuntime(htmlcode),
- 'director': getDirector(htmlcode),
- 'actor': getActor(htmlcode),
- 'release': getRelease(htmlcode),
- 'number': number,
- 'cover': getCover(htmlcode, number),
- 'imagecut': 1,
- 'tag': getTag(htmlcode),
- 'label':getLabel(htmlcode),
- 'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
- 'actor_photo': '',
- 'website': url,
- 'source': 'fanza.py',
+ fanza_hinban = getNum(htmlcode)
+ data = {
+ "title": getTitle(htmlcode).strip(getActor(htmlcode)),
+ "studio": getStudio(htmlcode),
+ "outline": getOutline(htmlcode),
+ "runtime": getRuntime(htmlcode),
+ "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
+ "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
+ "release": getRelease(htmlcode),
+ "number": fanza_hinban,
+ "cover": getCover(htmlcode, fanza_hinban),
+ "imagecut": 1,
+ "tag": getTag(htmlcode),
+ "label": getLabel(htmlcode),
+ "year": getYear(
+ getRelease(htmlcode)
+ ), # str(re.search('\d{4}',getRelease(a)).group()),
+ "actor_photo": "",
+ "website": chosen_url,
+ "source": "fanza.py",
}
- except :
- dic = {
- 'title': '',
+ except:
+ data = {
+ "title": "",
}
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) # .encode('UTF-8')
+ js = json.dumps(
+ data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
+ ) # .encode('UTF-8')
return js
-# main('DV-1562')
-# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
-#print(main('ipx292'))
+
+if __name__ == "__main__":
+ # print(main("DV-1562"))
+    # input("[+][+]Press enter key to exit, you can check the error message before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
+ # print(main("ipx292"))
+ pass