Update TwitToMast.js
Adjust docs; adjust some arguments; add argument to detect if running from loop; fix debuglog() function; add code to include tweet header; clean up tweetText formatting.
This commit is contained in:
parent
eccfe549fe
commit
3084c0e54f
1 changed file with 71 additions and 49 deletions
|
@ -16,10 +16,10 @@ const Q = require("q");
|
||||||
const args = process.argv;
|
const args = process.argv;
|
||||||
if (args[2] == "-h"){
|
if (args[2] == "-h"){
|
||||||
console.log("usage: $node ./TwitToMast.js [username] [tweet count] [debug level] [disable posts]");
|
console.log("usage: $node ./TwitToMast.js [username] [tweet count] [debug level] [disable posts]");
|
||||||
console.log(" username: (string) username of account to scrape");
|
console.log(" username: (string) username of account to scrape - required");
|
||||||
console.log(" tweet count: (integer) number of tweets to scrape");
|
console.log(" tweet count: (integer) number of tweets to scrape - required");
|
||||||
console.log(" debug level: (0-2) amount of information to print to console");
|
console.log(" debug level: (0-2) amount of information to print to console - defaults to 0");
|
||||||
console.log(" disable posts: ('noWrite') disable posting to Mastodon");
|
console.log(" disable posts: ('write','noWrite') enable/disable posting to Mastodon - defaults to enable");
|
||||||
console.log(" ");
|
console.log(" ");
|
||||||
console.log(" config.txt:");
|
console.log(" config.txt:");
|
||||||
console.log(" API_KEY");
|
console.log(" API_KEY");
|
||||||
|
@ -43,13 +43,13 @@ if (isNaN(parseInt(args[3]))){
|
||||||
console.log("for help: $TwitToMast.js -h");
|
console.log("for help: $TwitToMast.js -h");
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
if (!((args[4] >= 0) && (args[4] <= 2)) && (typeof args[4] != 'undefined')){
|
if (!((parseInt(args[4]) >= 0) && (parseInt(args[4]) <= 2)) && (typeof args[4] != 'undefined')){
|
||||||
console.log("Expected [0-2], got '" + args[4] + "' instead");
|
console.log("Expected [0-2], got '" + args[4] + "' instead");
|
||||||
console.log("for help: $TwitToMast.js -h");
|
console.log("for help: $TwitToMast.js -h");
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
if (args[5] != 'noWrite' && typeof args[5] != 'undefined') {
|
if ((args[5] != 'noWrite' && args[5] != 'write') && typeof args[5] != 'undefined') {
|
||||||
console.log("Expected 'noWrite' or undefined, got '" + args[5] + "' instead");
|
console.log("Expected 'noWrite', 'write', or undefined, got '" + args[5] + "' instead");
|
||||||
console.log("for help: $TwitToMast.js -h");
|
console.log("for help: $TwitToMast.js -h");
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
@ -87,6 +87,12 @@ if (typeof args[5] == 'undefined') {
|
||||||
} else if (args[5] == 'noWrite') {
|
} else if (args[5] == 'noWrite') {
|
||||||
disablePosts = true;
|
disablePosts = true;
|
||||||
}
|
}
|
||||||
|
var fromLoop = false;
|
||||||
|
if (args[6] == 'fromLoop'){
|
||||||
|
fromLoop = true;
|
||||||
|
} else {
|
||||||
|
fromLoop = false;
|
||||||
|
}
|
||||||
debuglog(args,2);
|
debuglog(args,2);
|
||||||
debuglog("userName: " + userName,2);
|
debuglog("userName: " + userName,2);
|
||||||
debuglog("maxTweetScan: " + maxTweetScan,2);
|
debuglog("maxTweetScan: " + maxTweetScan,2);
|
||||||
|
@ -113,7 +119,7 @@ function downloadImage(url, filepath) {
|
||||||
|
|
||||||
function debuglog(debugString,logLevel) {
|
function debuglog(debugString,logLevel) {
|
||||||
prefix = "";
|
prefix = "";
|
||||||
switch (debug) {
|
switch (logLevel) {
|
||||||
case 0:
|
case 0:
|
||||||
prefix = "";
|
prefix = "";
|
||||||
break;
|
break;
|
||||||
|
@ -148,6 +154,7 @@ debuglog("API_URL: " + config[1],1);
|
||||||
debuglog("Enable Quote Tweets: " + modulesToEnable[0],1);
|
debuglog("Enable Quote Tweets: " + modulesToEnable[0],1);
|
||||||
debuglog("Enable Thread Tweets: " + modulesToEnable[1],1);
|
debuglog("Enable Thread Tweets: " + modulesToEnable[1],1);
|
||||||
debuglog("Disable posting to Mastodon: " + disablePosts,1);
|
debuglog("Disable posting to Mastodon: " + disablePosts,1);
|
||||||
|
debuglog("running from loop: " + fromLoop,1);
|
||||||
|
|
||||||
//SETUP REMAINDER OF VARIABLES
|
//SETUP REMAINDER OF VARIABLES
|
||||||
|
|
||||||
|
@ -163,6 +170,10 @@ const tweetXPath = (timeLineXPath + `/div`); //the div containing individual twe
|
||||||
|
|
||||||
const urlCardXPath = `/div/div/div/article/div/div/div/div[*]/div[*]/div[*]/div[*]/div/div[2]/a`
|
const urlCardXPath = `/div/div/div/article/div/div/div/div[*]/div[*]/div[*]/div[*]/div/div[2]/a`
|
||||||
|
|
||||||
|
const tweeterHandle = `/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div/div/div[2]/div/div[1]/a/div/span[contains(text(),"@")]` //text label containing tweeter's handle
|
||||||
|
|
||||||
|
const tweeterName = `/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div/div/div[1]/div/a/div/div[1]/span/span` //text label containing tweeter's name
|
||||||
|
|
||||||
const quoteTweetHandleXPath = `/div/div/div/article/div/div/div/div[2]/div[2]/div[2]/div[2]/div[*]/div[2]/div/div[1]/div/div/div/div/div/div[2]/div[1]/div/div/div/span`; //xpath to text label that reveals if a tweet is a quote tweet (leads to the quote tweeted user's handle)
|
const quoteTweetHandleXPath = `/div/div/div/article/div/div/div/div[2]/div[2]/div[2]/div[2]/div[*]/div[2]/div/div[1]/div/div/div/div/div/div[2]/div[1]/div/div/div/span`; //xpath to text label that reveals if a tweet is a quote tweet (leads to the quote tweeted user's handle)
|
||||||
|
|
||||||
const quoteTweetContentXPath= `/div/div/div/article/div/div/div/div[2]/div[2]/div[2]/div[2]/div[*]/div[2][div/div[1]/div/div/div/div/div/div[2]/div[1]/div/div/div/span]` //xpath to locate entirety of Quote Tweeted Content
|
const quoteTweetContentXPath= `/div/div/div/article/div/div/div/div[2]/div[2]/div[2]/div[2]/div[*]/div[2][div/div[1]/div/div/div/div/div/div[2]/div[1]/div/div/div/span]` //xpath to locate entirety of Quote Tweeted Content
|
||||||
|
@ -173,7 +184,7 @@ const threadIndicatorXPath = `/div/div/div/article/div/a/div/div[2]/div/span`; /
|
||||||
|
|
||||||
const tweetTextXPath = `//div[@data-testid="tweetText"]`; //xpath that leads to div containing all tweet text
|
const tweetTextXPath = `//div[@data-testid="tweetText"]`; //xpath that leads to div containing all tweet text
|
||||||
|
|
||||||
const tweetURLXPath = `//div/a[contains(@href, 'status')]`; //xpath to image that reveals if a tweet is a part of a thread
|
const tweetURLXPath = `//div/a[contains(@href, 'status')]`; //xpath to tweet url
|
||||||
|
|
||||||
const singleImageXPath = `//div[2]/div/img[@alt="Image"]`; //xpath to image that reveals if a tweet has one image
|
const singleImageXPath = `//div[2]/div/img[@alt="Image"]`; //xpath to image that reveals if a tweet has one image
|
||||||
|
|
||||||
|
@ -288,6 +299,7 @@ driver.executeScript("document.body.style.zoom='35%'");
|
||||||
}
|
}
|
||||||
|
|
||||||
//GET TWEET URL
|
//GET TWEET URL
|
||||||
|
await driver.wait(until.elementLocated(By.xpath(thisTweetXPath + tweetURLXPath)), 1000);
|
||||||
mobileTweetURL = await driver.findElement(By.xpath(thisTweetXPath + tweetURLXPath)).getAttribute('href');
|
mobileTweetURL = await driver.findElement(By.xpath(thisTweetXPath + tweetURLXPath)).getAttribute('href');
|
||||||
tweetURL = await mobileTweetURL.replace('mobile.','');
|
tweetURL = await mobileTweetURL.replace('mobile.','');
|
||||||
debuglog(tweetURL,2);
|
debuglog(tweetURL,2);
|
||||||
|
@ -297,7 +309,16 @@ driver.executeScript("document.body.style.zoom='35%'");
|
||||||
//SETUP TEXT FOR TWEET STATUS
|
//SETUP TEXT FOR TWEET STATUS
|
||||||
var tweetHasText = false;
|
var tweetHasText = false;
|
||||||
await driver.wait(until.elementLocated(By.xpath(timeLineXPath + tweetTextXPath)), 1000);
|
await driver.wait(until.elementLocated(By.xpath(timeLineXPath + tweetTextXPath)), 1000);
|
||||||
|
tweetText = ""
|
||||||
|
|
||||||
|
//IS TWEET PART OF MULTISCRAPER, IF SO ADD HEADER
|
||||||
|
if (fromLoop) {
|
||||||
|
tweeterHandleText = await driver.findElement(By.xpath(thisTweetXPath + tweeterHandle)).getText();
|
||||||
|
tweeterNameText = await driver.findElement(By.xpath(thisTweetXPath + tweeterName)).getText();
|
||||||
|
tweetText = (tweeterNameText + " (" + tweeterHandleText + ")\r\n" + tweetURL + "\r\n\r\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
//DOES TWEET HAVE TEXT
|
||||||
tweetHasText = await driver.findElement(webdriver.By.xpath(thisTweetXPath + tweetTextXPath)).then(function() {
|
tweetHasText = await driver.findElement(webdriver.By.xpath(thisTweetXPath + tweetTextXPath)).then(function() {
|
||||||
return true; // It existed
|
return true; // It existed
|
||||||
}, function(err) {
|
}, function(err) {
|
||||||
|
@ -307,13 +328,12 @@ driver.executeScript("document.body.style.zoom='35%'");
|
||||||
webdriver.promise.rejected(err);
|
webdriver.promise.rejected(err);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
//IF SO, ADD BODY TEXT TO TWEET TEXT
|
||||||
if (tweetHasText){
|
if (tweetHasText){
|
||||||
tweetText = await driver.findElement(By.xpath(thisTweetXPath + tweetTextXPath)).getText();
|
tweetText = tweetText + await driver.findElement(By.xpath(thisTweetXPath + tweetTextXPath)).getText();
|
||||||
//debuglog("Tweet Text: " + tweetText,2);
|
|
||||||
} else {
|
|
||||||
tweetText = " ";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//DOES TWEET HAVE A URL CARD
|
||||||
tweetHasURL = await driver.findElement(webdriver.By.xpath(thisTweetXPath + urlCardXPath)).then(function() {
|
tweetHasURL = await driver.findElement(webdriver.By.xpath(thisTweetXPath + urlCardXPath)).then(function() {
|
||||||
return true; // It existed
|
return true; // It existed
|
||||||
}, function(err) {
|
}, function(err) {
|
||||||
|
@ -323,19 +343,17 @@ driver.executeScript("document.body.style.zoom='35%'");
|
||||||
webdriver.promise.rejected(err);
|
webdriver.promise.rejected(err);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
//IF SO, ADD URL TO TWEET TEXT
|
||||||
if (tweetHasURL){
|
if (tweetHasURL){
|
||||||
tweetCardURL = await driver.findElement(By.xpath(thisTweetXPath + urlCardXPath)).getAttribute('href');
|
tweetCardURL = await driver.findElement(By.xpath(thisTweetXPath + urlCardXPath)).getAttribute('href');
|
||||||
await expandUrl(tweetCardURL)
|
await expandUrl(tweetCardURL)
|
||||||
.then(function (longUrl) {
|
.then(function (longUrl) {
|
||||||
debuglog("Long URL:" + longUrl,2);
|
debuglog("Long URL:" + longUrl,2);
|
||||||
tweetText = tweetText + "\r\n\r\n" + longUrl;
|
tweetText = tweetText + "\r\n\r\n" + longUrl;
|
||||||
debuglog("Tweet Text: " + tweetText,2);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
debuglog("tweetHasText: " + tweetHasText,2);
|
|
||||||
debuglog("tweetHasURL: " + tweetHasURL,2);
|
|
||||||
|
|
||||||
//CHECK FOR QUOTE TWEETS
|
//IS TWEET A QUOTE TWEET
|
||||||
isQT = await driver.findElement(webdriver.By.xpath(thisTweetXPath + quoteTweetContentXPath)).then(function() {
|
isQT = await driver.findElement(webdriver.By.xpath(thisTweetXPath + quoteTweetContentXPath)).then(function() {
|
||||||
return true; // It existed
|
return true; // It existed
|
||||||
}, function(err) {
|
}, function(err) {
|
||||||
|
@ -345,6 +363,7 @@ driver.executeScript("document.body.style.zoom='35%'");
|
||||||
//webdriver.promise.rejected(err);
|
//webdriver.promise.rejected(err);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
//IF SO, ADD QUOTE TWEET LINK TO TWEET TEXT
|
||||||
if (isQT){
|
if (isQT){
|
||||||
await driver.sleep(1 * 1000)
|
await driver.sleep(1 * 1000)
|
||||||
quotedContent = await driver.findElement(webdriver.By.xpath(thisTweetXPath + quoteTweetContentXPath));
|
quotedContent = await driver.findElement(webdriver.By.xpath(thisTweetXPath + quoteTweetContentXPath));
|
||||||
|
@ -363,6 +382,8 @@ driver.executeScript("document.body.style.zoom='35%'");
|
||||||
await driver.switchTo().window(parent);
|
await driver.switchTo().window(parent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
debuglog(tweetText,1)
|
||||||
|
|
||||||
//CODE TO RUN IF TWEET IS NOT IN CSV
|
//CODE TO RUN IF TWEET IS NOT IN CSV
|
||||||
debuglog("Tweet #" + i + " has not been processed.", 1);
|
debuglog("Tweet #" + i + " has not been processed.", 1);
|
||||||
|
|
||||||
|
@ -479,7 +500,7 @@ driver.executeScript("document.body.style.zoom='35%'");
|
||||||
}
|
}
|
||||||
|
|
||||||
//REMOVE SAVED IMAGE FILES
|
//REMOVE SAVED IMAGE FILES
|
||||||
debuglog("cleaning up...",1);
|
debuglog("Cleaning up...",1);
|
||||||
for (var j = 1; j < 5; j++) {
|
for (var j = 1; j < 5; j++) {
|
||||||
path = ("./" + i + "." + j + ".jpg");
|
path = ("./" + i + "." + j + ".jpg");
|
||||||
try {
|
try {
|
||||||
|
@ -502,6 +523,7 @@ driver.executeScript("document.body.style.zoom='35%'");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
debuglog("Finished scraping " + userName + "'s tweets",1)
|
||||||
//EXIT WEBDRIVER
|
//EXIT WEBDRIVER
|
||||||
driver.quit();
|
driver.quit();
|
||||||
}());
|
}());
|
Reference in a new issue